In [587]:
import pandas as pd
import numpy as np

# Render up to 50 rows per frame, and keep showing 50 rows when the repr is truncated.
pd.options.display.max_rows = 50
pd.options.display.min_rows = 50

In [588]:
# Load the base word list and index it by the lower-cased word, sorted alphabetically.
words_df = pd.read_csv('basic_words.csv')
words_df['Word'] = words_df['Word'].str.lower()
words_df = words_df.set_index('Word').sort_index()

# Add the per-source metadata columns as all-NaN placeholders.
words_df[[
    'oxford_level',
    'oxford_version',
    'lexile_grade',
    'lexile_category',
    'toefl',
    'tsl',
    'bsl',
    'ngsl_freq',
    'ngsl_sp_freq',
    'naver_priority',
    'lemmetized',
]] = np.nan

words_df

Unnamed: 0_level_0,oxford_level,oxford_version,lexile_grade,lexile_category,toefl,tsl,bsl,ngsl_freq,ngsl_sp_freq,naver_priority,lemmetized
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
abandon,,,,,,,,,,,
abdomen,,,,,,,,,,,
able,,,,,,,,,,,
abnormal,,,,,,,,,,,
aboard,,,,,,,,,,,
abolish,,,,,,,,,,,
abort,,,,,,,,,,,
about,,,,,,,,,,,
above,,,,,,,,,,,
abroad,,,,,,,,,,,


## Oxford

In [589]:
# Load the Oxford list, keyed by lower-cased word.
ox_df = pd.read_csv('words_refined/all_oxford.csv', index_col='Word')
ox_df.index = ox_df.index.str.lower()

# Step 1: append Oxford words that words_df does not contain yet.
words_df = pd.concat(
    [
        words_df,
        ox_df.loc[~ox_df.index.isin(words_df.index)],
    ]
)
# Step 2: for words already present, copy the Oxford details over in place.
words_df.update(ox_df)
words_df.sort_index(inplace=True)

words_df

Unnamed: 0_level_0,oxford_level,oxford_version,lexile_grade,lexile_category,toefl,tsl,bsl,ngsl_freq,ngsl_sp_freq,naver_priority,lemmetized
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
a,A1,3000.0,,,,,,,,,
abandon,B2,3000.0,,,,,,,,,
abdomen,,,,,,,,,,,
ability,A2,3000.0,,,,,,,,,
able,,,,,,,,,,,
abnormal,,,,,,,,,,,
aboard,,,,,,,,,,,
abolish,C1,5000.0,,,,,,,,,
abort,,,,,,,,,,,
about,A1,3000.0,,,,,,,,,


## Lexile

In [590]:
# Load the Lexile list and rename its columns to match the words_df schema
# ('Domain' -> 'lexile_category', 'Level' -> 'lexile_grade').
lex_df = (
    pd.read_csv('words_refined/lexiles_no_duplicates.csv', index_col='Word')
    .rename(columns={'Domain': 'lexile_category', 'Level': 'lexile_grade'})
)
lex_df.index = lex_df.index.str.lower()

# Append Lexile words that are new to the table, then copy the Lexile values
# onto the words that were already present.
words_df = pd.concat([words_df, lex_df[~lex_df.index.isin(words_df.index)]])
words_df.update(lex_df)
words_df = words_df.sort_index()

# Only the final expression of a cell is rendered, so the merged table is the
# single thing shown here.
words_df

Unnamed: 0_level_0,oxford_level,oxford_version,lexile_grade,lexile_category,toefl,tsl,bsl,ngsl_freq,ngsl_sp_freq,naver_priority,lemmetized
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
a,A1,3000.0,,,,,,,,,
abandon,B2,3000.0,,,,,,,,,
abbot,,,9.0,social_studies,,,,,,,
abbreviate,,,9.0,general,,,,,,,
abbreviation,,,6.0,general,,,,,,,
abdicate,,,9.0,social_studies,,,,,,,
abdomen,,,6.0,science,,,,,,,
abdominal,,,9.0,science,,,,,,,
aberration,,,9.0,science,,,,,,,
ability,A2,3000.0,4.0,general,,,,,,,


## TOEFL

In [591]:
# Load the TOEFL list and flag every word in it with 'Y'.
toefl_df = pd.read_csv('words_refined/toefl.csv', index_col='Word')
toefl_df['toefl'] = 'Y'
toefl_df.index = toefl_df.index.str.lower()

# Append TOEFL words that are new to the table, then set the flag on the
# words that were already present.
words_df = pd.concat([words_df, toefl_df[~toefl_df.index.isin(words_df.index)]])
words_df.update(toefl_df)
words_df = words_df.sort_index()

words_df

Unnamed: 0_level_0,oxford_level,oxford_version,lexile_grade,lexile_category,toefl,tsl,bsl,ngsl_freq,ngsl_sp_freq,naver_priority,lemmetized
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
a,A1,3000.0,,,,,,,,,
abandon,B2,3000.0,,,,,,,,,
abbot,,,9.0,social_studies,,,,,,,
abbreviate,,,9.0,general,Y,,,,,,
abbreviation,,,6.0,general,Y,,,,,,
abdicate,,,9.0,social_studies,,,,,,,
abdomen,,,6.0,science,Y,,,,,,
abdominal,,,9.0,science,,,,,,,
aberration,,,9.0,science,,,,,,,
abigail,,,,,Y,,,,,,


## TSL

In [592]:
# Load the TSL list, discard its 'Lemmas' column and flag every word with 'Y'.
tsl_df = (
    pd.read_csv('words_refined/tsl_with_lemmas.csv', index_col='Word')
    .drop(columns='Lemmas')
)
tsl_df['tsl'] = 'Y'
tsl_df.index = tsl_df.index.str.lower()

# Append TSL words that are new to the table, then set the flag on the
# words that were already present.
words_df = pd.concat([words_df, tsl_df[~tsl_df.index.isin(words_df.index)]])
words_df.update(tsl_df)
words_df = words_df.sort_index()

words_df

Unnamed: 0_level_0,oxford_level,oxford_version,lexile_grade,lexile_category,toefl,tsl,bsl,ngsl_freq,ngsl_sp_freq,naver_priority,lemmetized
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
a,A1,3000.0,,,,,,,,,
abandon,B2,3000.0,,,,,,,,,
abbot,,,9.0,social_studies,,,,,,,
abbreviate,,,9.0,general,Y,,,,,,
abbreviation,,,6.0,general,Y,,,,,,
abdicate,,,9.0,social_studies,,,,,,,
abdomen,,,6.0,science,Y,,,,,,
abdominal,,,9.0,science,,,,,,,
aberration,,,9.0,science,,,,,,,
abide,,,,,,Y,,,,,


## BSL

In [593]:
# Load the BSL list, discard its 'Lemmas' column and flag every word with 'Y'.
bsl_df = (
    pd.read_csv('words_refined/bsl_with_lemmas.csv', index_col='Word')
    .drop(columns='Lemmas')
)
bsl_df['bsl'] = 'Y'
bsl_df.index = bsl_df.index.str.lower()

# Append BSL words that are new to the table, then set the flag on the
# words that were already present.
words_df = pd.concat([words_df, bsl_df[~bsl_df.index.isin(words_df.index)]])
words_df.update(bsl_df)
words_df = words_df.sort_index()

# Drop rows whose index label is missing (NaN).
# NOTE(review): read_csv parses strings such as 'null', 'nan' and 'NA' as
# missing by default, so genuine words spelled that way may be what ends up
# here -- confirm against the source lists before relying on this filter.
words_df = words_df[words_df.index.notnull()]

words_df


Unnamed: 0_level_0,oxford_level,oxford_version,lexile_grade,lexile_category,toefl,tsl,bsl,ngsl_freq,ngsl_sp_freq,naver_priority,lemmetized
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
a,A1,3000.0,,,,,,,,,
abandon,B2,3000.0,,,,,,,,,
abbot,,,9.0,social_studies,,,,,,,
abbreviate,,,9.0,general,Y,,,,,,
abbreviation,,,6.0,general,Y,,,,,,
abdicate,,,9.0,social_studies,,,,,,,
abdomen,,,6.0,science,Y,,,,,,
abdominal,,,9.0,science,,,,,,,
aberration,,,9.0,science,,,,,,,
abide,,,,,,Y,,,,,


## NGSL

In [594]:
# Load the NGSL frequency list, renaming 'Frequency' to the words_df column name.
ngsl_df = (
    pd.read_csv('words_refined/ngsl_freq.csv', index_col='Word')
    .rename(columns={'Frequency': 'ngsl_freq'})
    .sort_index()
)
# Discard rows with a missing word before lower-casing the index.
# NOTE(review): read_csv parses strings such as 'null', 'nan' and 'NA' as
# missing by default, so genuine words spelled that way may be dropped here --
# confirm against the source list.
ngsl_df = ngsl_df[ngsl_df.index.notnull()]
ngsl_df.index = ngsl_df.index.str.lower()

# Append NGSL words that are new to the table, then copy the frequency value
# onto the words that were already present.
words_df = pd.concat([words_df, ngsl_df[~ngsl_df.index.isin(words_df.index)]])
words_df.update(ngsl_df)
words_df = words_df.sort_index()

words_df


Unnamed: 0_level_0,oxford_level,oxford_version,lexile_grade,lexile_category,toefl,tsl,bsl,ngsl_freq,ngsl_sp_freq,naver_priority,lemmetized
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
a,A1,3000.0,,,,,,A1,,,
aback,,,,,,,,B3,,,
abacus,,,,,,,,C3,,,
abalone,,,,,,,,C3,,,
abandon,B2,3000.0,,,,,,A2,,,
abandonment,,,,,,,,B3,,,
abase,,,,,,,,C4,,,
abash,,,,,,,,C4,,,
abate,,,,,,,,C1,,,
abatement,,,,,,,,C1,,,


## NGSL-SP

In [595]:
# Load the NGSL-SP frequency list, renaming 'Frequency' to the words_df column name.
ngsl_sp_df = (
    pd.read_csv('words_refined/ngsl_sp_freq.csv', index_col='Word')
    .rename(columns={'Frequency': 'ngsl_sp_freq'})
    .sort_index()
)
# Discard rows with a missing word before lower-casing the index.
# NOTE(review): read_csv parses strings such as 'null', 'nan' and 'NA' as
# missing by default, so genuine words spelled that way may be dropped here --
# confirm against the source list.
ngsl_sp_df = ngsl_sp_df[ngsl_sp_df.index.notnull()]
ngsl_sp_df.index = ngsl_sp_df.index.str.lower()

# Append NGSL-SP words that are new to the table, then copy the frequency value
# onto the words that were already present.
words_df = pd.concat([words_df, ngsl_sp_df[~ngsl_sp_df.index.isin(words_df.index)]])
words_df.update(ngsl_sp_df)
words_df = words_df.sort_index()

words_df


Unnamed: 0_level_0,oxford_level,oxford_version,lexile_grade,lexile_category,toefl,tsl,bsl,ngsl_freq,ngsl_sp_freq,naver_priority,lemmetized
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2 - sup,,,,,,,,,C1,,
a,A1,3000.0,,,,,,A1,A1,,
aback,,,,,,,,B3,C1,,
abacus,,,,,,,,C3,C3,,
abalone,,,,,,,,C3,,,
abandon,B2,3000.0,,,,,,A2,A3,,
abandonment,,,,,,,,B3,C1,,
abase,,,,,,,,C4,,,
abash,,,,,,,,C4,,,
abate,,,,,,,,C1,C1,,


## Cleanup

In [596]:
# Fill NaN values.
# Level / category / frequency columns: missing becomes 'NN'.
words_df['oxford_level'] = words_df['oxford_level'].fillna('NN')
words_df['lexile_category'] = words_df['lexile_category'].fillna('NN')
words_df['ngsl_freq'] = words_df['ngsl_freq'].fillna('NN')
words_df['ngsl_sp_freq'] = words_df['ngsl_sp_freq'].fillna('NN')

# Membership flags: missing becomes 'N' (present words were set to 'Y').
words_df['toefl'] = words_df['toefl'].fillna('N')
words_df['tsl'] = words_df['tsl'].fillna('N')
words_df['bsl'] = words_df['bsl'].fillna('N')

# Numeric columns: missing becomes 0, then the column is cast to int.
words_df['oxford_version'] = words_df['oxford_version'].fillna(0).astype(int)
words_df['lexile_grade'] = words_df['lexile_grade'].fillna(0).astype(int)

# Lower-case the index once more (just in case) and restore alphabetical order.
words_df.index = words_df.index.str.lower()
words_df = words_df.sort_index()

words_df

Unnamed: 0_level_0,oxford_level,oxford_version,lexile_grade,lexile_category,toefl,tsl,bsl,ngsl_freq,ngsl_sp_freq,naver_priority,lemmetized
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2 - sup,NN,0,0,NN,N,N,N,NN,C1,,
a,A1,3000,0,NN,N,N,N,A1,A1,,
aback,NN,0,0,NN,N,N,N,B3,C1,,
abacus,NN,0,0,NN,N,N,N,C3,C3,,
abalone,NN,0,0,NN,N,N,N,C3,NN,,
abandon,B2,3000,0,NN,N,N,N,A2,A3,,
abandonment,NN,0,0,NN,N,N,N,B3,C1,,
abase,NN,0,0,NN,N,N,N,C4,NN,,
abash,NN,0,0,NN,N,N,N,C4,NN,,
abate,NN,0,0,NN,N,N,N,C1,C1,,


## Export

In [597]:
# Export is currently disabled; uncomment the line below to write the merged
# table to words_data.csv.
# words_df.to_csv("words_data.csv")