In [27]:
import pandas as pd
import numpy as np

pd.set_option('display.max_rows', 50)
pd.set_option('display.min_rows', 50)

In [28]:
words_df = pd.read_csv('../words_refined/basic_words.csv')
words_df[['oxford_level', 'oxford_version', 'lexile_grade', 'lexile_category', 'awsl', 'toefl', 'tsl', 'bsl', 'ngsl_freq', 'ngsl_sp_freq', 'naver_priority', 'lemmetized']] = np.nan

words_df["Word"] = words_df["Word"].str.lower()
words_df = words_df.set_index('Word')
words_df = words_df.sort_index()

words_df

Unnamed: 0_level_0,oxford_level,oxford_version,lexile_grade,lexile_category,awsl,toefl,tsl,bsl,ngsl_freq,ngsl_sp_freq,naver_priority,lemmetized
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
abandon,,,,,,,,,,,,
abdomen,,,,,,,,,,,,
able,,,,,,,,,,,,
abnormal,,,,,,,,,,,,
aboard,,,,,,,,,,,,
abolish,,,,,,,,,,,,
abort,,,,,,,,,,,,
about,,,,,,,,,,,,
above,,,,,,,,,,,,
abroad,,,,,,,,,,,,


## Oxford

In [29]:
ox3000 = pd.read_csv('../words_refined/ox_3000_american.csv', index_col='Word')
ox3000
ox5000 = pd.read_csv('../words_refined/ox_5000_american.csv', index_col='Word')
ox5000

ox_df = ox3000.append(ox5000)
ox_df
ox_df = ox_df.rename(columns={'Level': 'oxford_level', 'Version': 'oxford_version'})
ox_df = ox_df[~ox_df.index.duplicated(keep='first')]
ox_df

# add oxford word rows that are not in words_df
words_df = pd.concat([words_df, ox_df[~ox_df.index.isin(words_df.index)]])
# for words that exist already, update oxford details
words_df.update(ox_df)
words_df = words_df.sort_index()

words_df

Unnamed: 0_level_0,oxford_level,oxford_version,lexile_grade,lexile_category,awsl,toefl,tsl,bsl,ngsl_freq,ngsl_sp_freq,naver_priority,lemmetized
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
abandon,B2,3000.0,,,,,,,,,,
abdomen,,,,,,,,,,,,
ability,A2,3000.0,,,,,,,,,,
able,A2,3000.0,,,,,,,,,,
abnormal,,,,,,,,,,,,
aboard,,,,,,,,,,,,
abolish,C1,5000.0,,,,,,,,,,
abort,,,,,,,,,,,,
abortion,C1,5000.0,,,,,,,,,,
about,A1,3000.0,,,,,,,,,,


## Lexile

In [30]:
lex_df = pd.read_csv('../words_refined/lexiles_no_duplicates.csv', index_col='Word')
lex_df = lex_df.rename(columns={'Domain': 'lexile_category', 'Level': 'lexile_grade'})
lex_df.index = lex_df.index.str.lower()
words_df = pd.concat([words_df, lex_df[~lex_df.index.isin(words_df.index)]])
# words_df[words_df.index.duplicated(keep=False)]
words_df.update(lex_df)
words_df.sort_index(inplace=True)

lex_df
words_df

Unnamed: 0_level_0,oxford_level,oxford_version,lexile_grade,lexile_category,awsl,toefl,tsl,bsl,ngsl_freq,ngsl_sp_freq,naver_priority,lemmetized
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
abandon,B2,3000.0,,,,,,,,,,
abbot,,,9.0,social_studies,,,,,,,,
abbreviate,,,9.0,general,,,,,,,,
abbreviation,,,6.0,general,,,,,,,,
abdicate,,,9.0,social_studies,,,,,,,,
abdomen,,,6.0,science,,,,,,,,
abdominal,,,9.0,science,,,,,,,,
aberration,,,9.0,science,,,,,,,,
ability,A2,3000.0,4.0,general,,,,,,,,
abiotic,,,4.0,science,,,,,,,,


## TOEFL

In [31]:
toefl_df = pd.read_csv('../words_refined/toefl.csv', index_col='Word')
toefl_df['toefl'] = 'Y'
toefl_df.index = toefl_df.index.str.lower()
words_df = pd.concat([words_df, toefl_df[~toefl_df.index.isin(words_df.index)]])
words_df.update(toefl_df)
words_df.sort_index(inplace=True)

n = words_df[words_df['toefl'] == 'Y']
words_df

Unnamed: 0_level_0,oxford_level,oxford_version,lexile_grade,lexile_category,awsl,toefl,tsl,bsl,ngsl_freq,ngsl_sp_freq,naver_priority,lemmetized
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
abandon,B2,3000.0,,,,,,,,,,
abbot,,,9.0,social_studies,,,,,,,,
abbreviate,,,9.0,general,,Y,,,,,,
abbreviation,,,6.0,general,,Y,,,,,,
abdicate,,,9.0,social_studies,,,,,,,,
abdomen,,,6.0,science,,Y,,,,,,
abdominal,,,9.0,science,,,,,,,,
aberration,,,9.0,science,,,,,,,,
abigail,,,,,,Y,,,,,,
ability,A2,3000.0,4.0,general,,,,,,,,


## TSL

In [32]:
tsl_df = pd.read_csv('../words_refined/tsl_with_lemmas.csv', index_col='Word')
tsl_df = tsl_df.drop(columns='Lemmas')
tsl_df['tsl'] = 'Y'
tsl_df.index = tsl_df.index.str.lower()

words_df = pd.concat([words_df, tsl_df[~tsl_df.index.isin(words_df.index)]])
words_df.update(tsl_df)
words_df.sort_index(inplace=True)

n = words_df[words_df['tsl'] == 'Y']

tsl_df
words_df

Unnamed: 0_level_0,oxford_level,oxford_version,lexile_grade,lexile_category,awsl,toefl,tsl,bsl,ngsl_freq,ngsl_sp_freq,naver_priority,lemmetized
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
abandon,B2,3000.0,,,,,,,,,,
abbot,,,9.0,social_studies,,,,,,,,
abbreviate,,,9.0,general,,Y,,,,,,
abbreviation,,,6.0,general,,Y,,,,,,
abdicate,,,9.0,social_studies,,,,,,,,
abdomen,,,6.0,science,,Y,,,,,,
abdominal,,,9.0,science,,,,,,,,
aberration,,,9.0,science,,,,,,,,
abide,,,,,,,Y,,,,,
abigail,,,,,,Y,,,,,,


## BSL

In [33]:
bsl_df = pd.read_csv('../words_refined/bsl_with_lemmas.csv', index_col='Word')
bsl_df = bsl_df.drop(columns='Lemmas')
bsl_df['bsl'] = 'Y'
bsl_df.index = bsl_df.index.str.lower()

words_df = pd.concat([words_df, bsl_df[~bsl_df.index.isin(words_df.index)]])
words_df.update(bsl_df)
words_df.sort_index(inplace=True)

n = words_df[words_df['bsl'] == 'Y']

# get rid of NaN
words_df = words_df[words_df.index.notnull()]
bsl_df
words_df


Unnamed: 0_level_0,oxford_level,oxford_version,lexile_grade,lexile_category,awsl,toefl,tsl,bsl,ngsl_freq,ngsl_sp_freq,naver_priority,lemmetized
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
abandon,B2,3000.0,,,,,,,,,,
abbot,,,9.0,social_studies,,,,,,,,
abbreviate,,,9.0,general,,Y,,,,,,
abbreviation,,,6.0,general,,Y,,,,,,
abdicate,,,9.0,social_studies,,,,,,,,
abdomen,,,6.0,science,,Y,,,,,,
abdominal,,,9.0,science,,,,,,,,
aberration,,,9.0,science,,,,,,,,
abide,,,,,,,Y,,,,,
abigail,,,,,,Y,,,,,,


## AWSL

In [34]:
awsl_df = pd.read_csv("../words_refined/awsl_with_lemmas.csv", index_col='Word')
awsl_df = awsl_df.drop(columns="Lemmas")
awsl_df['awsl'] = 'Y'
awsl_df.index = awsl_df.index.str.lower()

awsl_df = awsl_df[~awsl_df.index.duplicated(keep='first')]

words_df = pd.concat([words_df, awsl_df[~awsl_df.index.isin(words_df.index)]])
words_df.update(awsl_df)
words_df.sort_index(inplace=True)

n = words_df[words_df['awsl'] == 'Y']

awsl_df
words_df


Unnamed: 0_level_0,oxford_level,oxford_version,lexile_grade,lexile_category,awsl,toefl,tsl,bsl,ngsl_freq,ngsl_sp_freq,naver_priority,lemmetized
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
a,,,,,Y,,,,,,,
abandon,B2,3000.0,,,,,,,,,,
abbot,,,9.0,social_studies,,,,,,,,
abbreviate,,,9.0,general,,Y,,,,,,
abbreviation,,,6.0,general,,Y,,,,,,
abdicate,,,9.0,social_studies,,,,,,,,
abdomen,,,6.0,science,,Y,,,,,,
abdominal,,,9.0,science,,,,,,,,
aberration,,,9.0,science,,,,,,,,
abide,,,,,,,Y,,,,,


## NGSL

In [35]:
ngsl_df = pd.read_csv('../words_refined/ngsl_freq.csv', index_col='Word')
ngsl_df = ngsl_df.rename(columns={'Frequency': 'ngsl_freq'})
ngsl_df.sort_index(inplace=True)
ngsl_df = ngsl_df[ngsl_df.index.notnull()]
ngsl_df.index = ngsl_df.index.str.lower()

words_df = pd.concat([words_df, ngsl_df[~ngsl_df.index.isin(words_df.index)]])
words_df.update(ngsl_df)
words_df.sort_index(inplace=True)

ngsl_df
words_df


Unnamed: 0_level_0,oxford_level,oxford_version,lexile_grade,lexile_category,awsl,toefl,tsl,bsl,ngsl_freq,ngsl_sp_freq,naver_priority,lemmetized
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
a,,,,,Y,,,,A1,,,
aback,,,,,,,,,B3,,,
abacus,,,,,,,,,C3,,,
abalone,,,,,,,,,C3,,,
abandon,B2,3000.0,,,,,,,A2,,,
abandonment,,,,,,,,,B3,,,
abase,,,,,,,,,C4,,,
abash,,,,,,,,,C4,,,
abate,,,,,,,,,C1,,,
abatement,,,,,,,,,C1,,,


## NGSL-SP

In [36]:
ngsl_sp_df = pd.read_csv('../words_refined/ngsl_sp_freq.csv', index_col='Word')
ngsl_sp_df = ngsl_sp_df.rename(columns={'Frequency': 'ngsl_sp_freq'})
ngsl_sp_df.sort_index(inplace=True)
ngsl_sp_df = ngsl_sp_df[ngsl_sp_df.index.notnull()]
ngsl_sp_df.index = ngsl_sp_df.index.str.lower()

words_df = pd.concat([words_df, ngsl_sp_df[~ngsl_sp_df.index.isin(words_df.index)]])
words_df.update(ngsl_sp_df)
words_df.sort_index(inplace=True)

ngsl_sp_df
words_df


Unnamed: 0_level_0,oxford_level,oxford_version,lexile_grade,lexile_category,awsl,toefl,tsl,bsl,ngsl_freq,ngsl_sp_freq,naver_priority,lemmetized
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2 - sup,,,,,,,,,,C1,,
a,,,,,Y,,,,A1,A1,,
aback,,,,,,,,,B3,C1,,
abacus,,,,,,,,,C3,C3,,
abalone,,,,,,,,,C3,,,
abandon,B2,3000.0,,,,,,,A2,A3,,
abandonment,,,,,,,,,B3,C1,,
abase,,,,,,,,,C4,,,
abash,,,,,,,,,C4,,,
abate,,,,,,,,,C1,C1,,


## NDL

In [None]:
ndl_df = pd.read_csv('../words_refined/bsl_with_lemmas.csv', index_col='Word')
ndl_df = ndl_df.drop(columns='Lemmas')
ndl_df['bsl'] = 'Y'
ndl_df.index = ndl_df.index.str.lower()

words_df = pd.concat([words_df, ndl_df[~ndl_df.index.isin(words_df.index)]])
words_df.update(ndl_df)
words_df.sort_index(inplace=True)

n = words_df[words_df['bsl'] == 'Y']

# get rid of NaN
words_df = words_df[words_df.index.notnull()]
ndl_df
words_df


## Cleanup

In [37]:
# NaN값 채우기
words_df.oxford_level = words_df.oxford_level.fillna('NN')
words_df.oxford_version = words_df.oxford_version.fillna(0).astype(int)
words_df.lexile_grade = words_df.lexile_grade.fillna(0).astype(int)
words_df.lexile_category = words_df.lexile_category.fillna('NN')
words_df.awsl = words_df.awsl.fillna('N')
words_df.toefl = words_df.toefl.fillna('N')
words_df.tsl = words_df.tsl.fillna('N')
words_df.bsl = words_df.bsl.fillna('N')
words_df.ngsl_freq = words_df.ngsl_freq.fillna('NN')
words_df.ngsl_sp_freq = words_df.ngsl_sp_freq.fillna('NN')

# to lower (just in case)
words_df.index = words_df.index.str.lower()
words_df.sort_index(inplace=True)
# delete words containing numbers
words_df = words_df[~words_df.index.str.contains('\d')]

words_df

Unnamed: 0_level_0,oxford_level,oxford_version,lexile_grade,lexile_category,awsl,toefl,tsl,bsl,ngsl_freq,ngsl_sp_freq,naver_priority,lemmetized
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
a,NN,0,0,NN,Y,N,N,N,A1,A1,,
aback,NN,0,0,NN,N,N,N,N,B3,C1,,
abacus,NN,0,0,NN,N,N,N,N,C3,C3,,
abalone,NN,0,0,NN,N,N,N,N,C3,NN,,
abandon,B2,3000,0,NN,N,N,N,N,A2,A3,,
abandonment,NN,0,0,NN,N,N,N,N,B3,C1,,
abase,NN,0,0,NN,N,N,N,N,C4,NN,,
abash,NN,0,0,NN,N,N,N,N,C4,NN,,
abate,NN,0,0,NN,N,N,N,N,C1,C1,,
abatement,NN,0,0,NN,N,N,N,N,C1,C3,,


## Special Words (Compound, Characters)

### Add Compound Words

In [38]:
# words with symbols
ex = words_df[words_df.index.str.contains("[^a-zA-Z]")]
ex

entire_lemmas = pd.read_csv('../final_datasets/entire_lemmas_complete.csv', index_col='Word')


# spaced, dashed words => unify to hyphen
spaced = words_df[words_df.index.str.contains(" ", regex=False)]
dashed = words_df[words_df.index.str.contains("-")]
compounds = spaced.append(dashed)
compounds

# ice cream
compounds.at['ice-cream', 'tsl'] = 'Y'
compounds.at['ice-cream', 'oxford_level'] = 'A1'
compounds.at['ice-cream', 'oxford_version'] = 3000
compounds = compounds.drop('ice cream')
compounds.sort_index()

# For all compound words, unify to hyphenated
def hyphenate(word):
  split = word.split()
  if len(split) == 2:
      hyphen_word = f'{split[0]}-{split[1]}'
      return hyphen_word
  return word
compounds.index = compounds.index.map(hyphenate)
compounds

# Do the same for actual words list
words_df.at['ice-cream', 'tsl'] = 'Y'
words_df.at['ice-cream', 'oxford_level'] = 'A1'
words_df.at['ice-cream', 'oxford_version'] = 3000
words_df = words_df.drop('ice cream')
words_df.sort_index()
words_df.index = words_df.index.map(hyphenate)
words_df

# For all compound words (now hyphenated),
# add to compounds lemma list & entire lemma list
compound_lemmas = pd.DataFrame()
compound_lemmas['Word'] = compounds.index

def addCompoundLemmas(word):
  split = word.split('-')
  if len(split) == 2:
    lemmas = f'{split[0]}{split[1]};{split[0]} {split[1]};{split[0]}{split[1]}s;{split[0]} {split[1]}s'
  if len(split) == 3:
    lemmas = f'{split[0]}{split[1]}{split[2]};{split[0]} {split[1]} {split[2]};{split[0]}{split[1]}{split[2]}s;{split[0]} {split[1]} {split[2]}s'
  return lemmas

compound_lemmas['Lemmas'] = compound_lemmas['Word']
compound_lemmas['Lemmas'] = compound_lemmas['Lemmas'].apply(addCompoundLemmas)
compound_lemmas.set_index('Word', inplace=True)
compound_lemmas.sort_index(inplace=True)
compound_lemmas
compound_lemmas.rename(columns={'Lemmas': 'compound_lemmas'}, inplace=True)

# for each word in compounds, find word in entire lemmas
# if word exists, add lemmas to lemmas (check duplicate)
# if not, add word and lemmas

entire_lemmas = entire_lemmas.append(compound_lemmas)
entire_lemmas = entire_lemmas.sum(level=0)
entire_lemmas = entire_lemmas.replace([0], '')
entire_lemmas['Lemmas'] = entire_lemmas['Lemmas'] + ';' + entire_lemmas['compound_lemmas'] 
entire_lemmas = entire_lemmas.drop('compound_lemmas', axis=1)
entire_lemmas.Lemmas = entire_lemmas.Lemmas.str.strip(';').str.replace(',', ';')
entire_lemmas

compound_lemmas

# entire_lemmas.loc[['t-shirt', 'ice-cream']]
# entire_lemmas[entire_lemmas.index.isin(compound_lemmas.index)]
# words_df.loc['no one']

# compound_lemmas.to_csv('../final_datasets/compound_lemmas.csv')
# entire_lemmas.to_csv('../final_datasets/entire_lemmas_compounds.csv')
# words_df.to_csv("../final_datasets/words_data_hyphenated.csv")


  entire_lemmas = entire_lemmas.sum(level=0)


Unnamed: 0_level_0,compound_lemmas
Word,Unnamed: 1_level_1
by-law,bylaw;by law;bylaws;by laws
decision-making,decisionmaking;decision making;decisionmakings...
e-book,ebook;e book;ebooks;e books
et-al,etal;et al;etals;et als
full-time,fulltime;full time;fulltimes;full times
high-profile,highprofile;high profile;highprofiles;high pro...
ice-cream,icecream;ice cream;icecreams;ice creams
in-law,inlaw;in law;inlaws;in laws
large-scale,largescale;large scale;largescales;large scales
long-standing,longstanding;long standing;longstandings;long ...


## Delete proper nouns (states)

In [39]:
states = pd.read_csv("../etc_data/us_states.csv", index_col='State')
states.index = states.index.str.lower()

words_df = words_df[~words_df.index.isin(states.index)]
# words_df.loc['abigail']

# words_df.to_csv("../final_datasets/words_data_compounds_nostates.csv")


## Create Existing Lemmas List

In [40]:
all_lemmas = pd.read_csv('../final_datasets/entire_lemmas_complete_compounds.csv', index_col='Word')
all_lemmas

existing_lemmas = all_lemmas[all_lemmas.index.isin(words_df.index)]
existing_lemmas.sort_index(inplace=True)

# existing_lemmas.to_csv("../final_datasets/existing_lemmas_final.csv")


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().sort_index(


## Lemmetized

In [41]:
words_df['lemmetized'] = words_df.index.isin(existing_lemmas.index)

words_df
# words_df[words_df['lemmetized'] == True]
# words_df.to_csv("../final_datasets/words_df_final.csv")

Unnamed: 0_level_0,oxford_level,oxford_version,lexile_grade,lexile_category,awsl,toefl,tsl,bsl,ngsl_freq,ngsl_sp_freq,naver_priority,lemmetized
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
a,NN,0,0,NN,Y,N,N,N,A1,A1,,True
aback,NN,0,0,NN,N,N,N,N,B3,C1,,False
abacus,NN,0,0,NN,N,N,N,N,C3,C3,,True
abalone,NN,0,0,NN,N,N,N,N,C3,NN,,True
abandon,B2,3000,0,NN,N,N,N,N,A2,A3,,True
abandonment,NN,0,0,NN,N,N,N,N,B3,C1,,True
abase,NN,0,0,NN,N,N,N,N,C4,NN,,True
abash,NN,0,0,NN,N,N,N,N,C4,NN,,True
abate,NN,0,0,NN,N,N,N,N,C1,C1,,True
abatement,NN,0,0,NN,N,N,N,N,C1,C3,,True
