In [1]:
import pandas as pd
from itertools import chain
from lemminflect import getAllInflections
from tqdm import tqdm

In [2]:
# Load the base dictionary, one word per line, into a set.
# The encoding is given explicitly so the read does not depend on the platform
# default and matches the utf-8 used when the result is written back out.
with open("src-tauri/src/new_short_dictionary.txt", "r", encoding="utf-8") as f:
    # Strip surrounding whitespace and skip blank lines so "" never becomes a word.
    words = {w for w in (line.strip() for line in f) if w}
# Untouched snapshot, used later to display which words have been removed.
original_words = words.copy()

In [3]:
# Load the country table; later cells read its "Name" and "Demonym 1" to "Demonym 3" columns.
country_df = pd.read_csv("countries.csv")

In [4]:
# Collect every whitespace-separated token of every country name, upper-cased
# and with parentheses removed, so each token can later be dropped from `words`.
countries = set()
for n in country_df["Name"]:
    # Skip missing names: an empty CSV cell is read as NaN, which has no .split().
    if pd.notnull(n):
        # split() without an argument splits on any whitespace run and never
        # yields empty tokens, unlike split(" ") on consecutive spaces.
        tokens = (c.upper().replace("(", "").replace(")", "") for c in n.split())
        # A token made only of parentheses is empty after cleaning; drop it.
        countries.update(t for t in tokens if t)

In [5]:
# Tokens that occur in country names but are kept as dictionary words:
# each is added to `words` and taken out of `countries`, so the removal
# step that follows leaves it alone.
real_words = {
    "CENTRAL", "TURKEY", "CHINA", "CHILE", "GUINEA", "EAST", "CAPE", "EQUATORIAL",
    "IVORY", "NEW", "UNITED", "SOUTH", "MARSHALL", "SAINT", "AND", "COAST", "COOK",
    "HEARD", "ISLAND", "ISLANDS", "ISLE", "MAN", "MINOR", "OCEAN", "OF", "OUTLYING",
    "REPUBLIC", "SOUTHERN", "STATES", "TERRITORIES", "TERRITORY", "TOME", "UNION",
    "VIRGIN", "THE", "NORTH", "NORTHERN", "KINGDOM", "DEMOCRATIC", "EMIRATES",
    "FEDERATION", "REUNION", "SANDWICH", "WESTERN", "JERSEY",
}
words |= real_words
countries -= real_words
# Display the remaining country tokens for manual inspection.
countries

{'AFGHANISTAN',
 'AFRICA',
 'AFRICAN',
 'ALBANIA',
 'ALGERIA',
 'AMERICA',
 'AMERICAN',
 'ANDORRA',
 'ANGOLA',
 'ANGUILLA',
 'ANTARCTICA',
 'ANTIGUA',
 'ARAB',
 'ARABIA',
 'ARGENTINA',
 'ARMENIA',
 'ARUBA',
 'AUSTRALIA',
 'AUSTRIA',
 'AZERBAIJAN',
 'BAHAMAS',
 'BAHRAIN',
 'BANGLADESH',
 'BARBADOS',
 'BARBUDA',
 'BARTHÉLEMY',
 'BELARUS',
 'BELGIUM',
 'BELIZE',
 'BENIN',
 'BERMUDA',
 'BHUTAN',
 'BOLIVIA',
 'BOSNIA',
 'BOTSWANA',
 'BOUVET',
 'BRAZIL',
 'BRITISH',
 'BRUNEI',
 'BULGARIA',
 'BURKINA',
 'BURMA',
 'BURUNDI',
 'CAICOS',
 'CALEDONIA',
 'CAMBODIA',
 'CAMEROON',
 'CANADA',
 'CARIBBEAN',
 'CAYMAN',
 'CHAD',
 'CHRISTMAS',
 'COCOS',
 'COLOMBIA',
 'COMOROS',
 'CONGO',
 'COSTA',
 'CROATIA',
 'CUBA',
 'CURAÇAO',
 'CYPRUS',
 'CZECH',
 'CÔTE',
 "D'IVOIRE",
 'DENMARK',
 'DJIBOUTI',
 'DOMINICA',
 'DOMINICAN',
 'ECUADOR',
 'EGYPT',
 'EL',
 'ERITREA',
 'ESTONIA',
 'ETHIOPIA',
 'FALKLAND',
 'FAROE',
 'FASO',
 'FIJI',
 'FINLAND',
 'FRANCE',
 'FRENCH',
 'FUTUNA',
 'GABON',
 'GAMBIA',
 'GEORGIA',

In [6]:
# Drop every remaining country-name token from the word list in one set operation.
words.difference_update(countries)

In [7]:
# Words present in the dictionary as loaded that are no longer in `words`.
original_words.difference(words)

{'AFRICAN',
 'AMERICAN',
 'BOLIVIA',
 'BRAZIL',
 'BRITISH',
 'CAYMAN',
 'CHAD',
 'COCOS',
 'CONGO',
 'COSTA',
 'CYPRUS',
 'FRENCH',
 'GAMBIA',
 'GRENADINES',
 'GUERNSEY',
 'HONG',
 'INDIA',
 'INDIAN',
 'JAPAN',
 'JORDAN',
 'KEELING',
 'LEONE',
 'MARTIN',
 'MOROCCO',
 'PANAMA',
 'RUSSIAN',
 'SIERRA',
 'TONGA',
 'TURKS'}

In [8]:
# Collect every whitespace-separated, upper-cased token from the three demonym columns.
demonyms = set()
for column in ("Demonym 1", "Demonym 2", "Demonym 3"):
    for d in country_df[column]:
        # Cells without a demonym are read as NaN and are skipped.
        if pd.notnull(d):
            # split() without an argument splits on any whitespace run and never
            # yields empty tokens, unlike split(" ") on consecutive spaces.
            demonyms.update(d.upper().split())

In [9]:
# Tokens that appear in the demonym columns but are kept as dictionary words:
# taking them out of `demonyms` exempts them from the removal step that follows.
non_demonyms = {
    "CENTRAL", "TURKEY", "CHINA", "CHILE", "GUINEA", "EAST", "CAPE", "EQUATORIAL",
    "IVORY", "NEW", "UNITED", "SOUTH", "MARSHALL", "SAINT", "AND", "COAST", "COOK",
    "HEARD", "ISLAND", "ISLANDS", "ISLE", "MAN", "MINOR", "OCEAN", "OF", "OUTLYING",
    "REPUBLIC", "SOUTHERN", "STATES", "TERRITORIES", "TERRITORY", "TOME", "UNION",
    "VIRGIN", "THE", "NORTH", "NORTHERN", "KINGDOM", "DEMOCRATIC", "EMIRATES",
    "FEDERATION", "REUNION", "SANDWICH", "WESTERN", "JERSEY",
    "ISLANDER", "POLE",
}
demonyms.difference_update(non_demonyms)
# Display the remaining demonym tokens for manual inspection.
demonyms

{'AFGHAN',
 'AFGHANI',
 'AFRICAN',
 'ALABANIAN',
 'ALBANIAN',
 'ALGERIAN',
 'AMERICAN',
 'ANDORRAN',
 'ANGOLAN',
 'ANGUILLAN',
 'ANTARCTIC',
 'ANTIGUAN',
 'ARABIAN',
 'ARGENTINE',
 'ARGENTINEAN',
 'ARGENTINIAN',
 'ARMENIAN',
 'ARUBIAN',
 'AUSTRALIAN',
 'AUSTRIAN',
 'AZERBAIJANI',
 'BAHAMEESE',
 'BAHAMIAN',
 'BAHRAINIAN',
 'BAJAN',
 'BANGLADESHI',
 'BARBADAN',
 'BARBADIAN',
 'BARTHÉLEMOIS',
 'BASOTHO',
 'BATSWANA',
 'BELARUSIAN',
 'BELGIAN',
 'BELIZEAN',
 'BENINESE',
 'BERMUDAN',
 'BHUTANESE',
 'BOLIVIAN',
 'BOSNIAN',
 'BRAZILIAN',
 'BRITISH',
 'BRUNEIAN',
 'BULGARIAN',
 'BURKINABE',
 'BURMESE',
 'BURUNDIAN',
 'CAICOS',
 'CALEDONIAN',
 'CALEDONIANS',
 'CAMBODIAN',
 'CAMEROONIAN',
 'CANADIAN',
 'CAYMANIAN',
 'CHADIAN',
 'CHILEAN',
 'CHINESE',
 'CHRISTMAS',
 'COCOS',
 'COCOSSIAN',
 'COLOMBIAN',
 'COLUMBIAN',
 'COMORAN',
 'CONGOLESE',
 'COSTA',
 'CROAT',
 'CROATIAN',
 'CUBAN',
 'CURAÇAOAN',
 'CYPRIOT',
 'CZECH',
 'DANE',
 'DANISH',
 'DJIBOUTI',
 'DJIBOUTIAN',
 'DOMINICAN',
 'DUTCH',
 'ECUA

In [10]:
# Remove every remaining demonym token from the word list (in-place set difference).
words -= demonyms

In [11]:
# Everything removed from the original dictionary so far (country and demonym tokens).
original_words.difference(words)

{'AFGHAN',
 'AFRICAN',
 'AMERICAN',
 'ANTARCTIC',
 'ARGENTINE',
 'BOLIVIA',
 'BRAZIL',
 'BRITISH',
 'CAYMAN',
 'CHAD',
 'CHINESE',
 'COCOS',
 'CONGO',
 'COSTA',
 'CYPRUS',
 'DANISH',
 'DUTCH',
 'EGYPTIAN',
 'FRENCH',
 'GAMBIA',
 'GERMAN',
 'GREEK',
 'GRENADINES',
 'GUERNSEY',
 'HONG',
 'INDIA',
 'INDIAN',
 'ITALIAN',
 'JAPAN',
 'JORDAN',
 'KEELING',
 'LEONE',
 'MARTIN',
 'MOROCCO',
 'PANAMA',
 'POLISH',
 'RUSSIAN',
 'SIERRA',
 'SPANISH',
 'SWISS',
 'TONGA',
 'TURKISH',
 'TURKS'}

In [12]:
# Expand the word list with the forms returned by getAllInflections for each word.
# Tokens that were deliberately removed above must not come back as an
# inflection of some other word (e.g. a plural that is also a country-name token).
excluded = countries | demonyms
# Iterate over a snapshot because `words` grows inside the loop.
for w in tqdm(words.copy()):
    inflections = getAllInflections(w)
    # The returned mapping's values are groups of forms; flatten them and
    # keep only the forms that are not on the exclusion list.
    words.update(
        form
        for form in chain.from_iterable(inflections.values())
        if form not in excluded
    )

100%|█████████████████████████████████████████████████████████████████████████| 18208/18208 [00:00<00:00, 33850.10it/s]


In [17]:
# (length of the longest word, total number of words)
max(map(len, words)), len(words)

(17, 30522)

In [16]:
# Write the final word list, one word per line, longest words first.
# Ties in length are broken alphabetically: `words` is a set, whose iteration
# order for strings can differ between interpreter runs, so sorting by length
# alone would produce a differently ordered file on each run.
with open("src-tauri/src/updated_short_dictionary.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(sorted(words, key=lambda w: (-len(w), w))))