## 라이브러리

In [10]:
import os
import warnings

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# Font family used for all matplotlib text.
# NOTE(review): presumably chosen so that Korean labels render — confirm.
plt.rcParams['font.family'] = 'AppleGothic'
# Alternative font family, currently disabled:
# plt.rcParams['font.family'] = 'NanumGothic'

## path 설정

In [16]:
# Base directory; every other location below is built from it.
path = '../..'
# Folder holding the scripts (category sub-folders are appended to it later).
script_path = f'{path}/script'
# Folder holding the word-list CSVs and the accumulated results CSV.
word_path = f'{path}/words/final_datasets'

## 결과 데이터프레임 생성

In [23]:
# Continue from the saved results table when it exists; otherwise start with
# an empty table indexed by title that has one column per word level.
results_csv = f'{word_path}/all_scripts_word_level_counts.csv'
if not os.path.exists(results_csv):
    empty_columns = ['title', 'level_1', 'level_2', 'level_3', 'level_4', 'level_5', 'level_6']
    df_result = pd.DataFrame(columns=empty_columns).set_index('title')
else:
    df_result = pd.read_csv(results_csv, index_col='title')

df_result

Unnamed: 0_level_0,level_1,level_2,level_3,level_4,level_5,level_6,running_time,WPS_mean,WPS_std,WPS_min,WPS_max,calculated,final_score
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
About.Time,645,201,104,59,24,13,,,,,,,
Arthur.Christmas,564,175,104,54,40,28,,,,,,,
Baby.Driver,604,165,122,46,23,24,,,,,,,
Bad.Boys,613,197,108,41,35,29,,,,,,,
Bee.Movie,654,230,156,70,28,43,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Seaspiracy,814,335,237,50,43,28,,,,,,,
The.Social.Dilemma,758,327,184,41,42,31,,,,,,,
Three.Identical.Strangers,650,195,108,33,26,18,,,,,,,
Virunga,526,157,74,21,11,7,,,,,,,


## 자막(단어) 파일 불러오기

### 경로 설정

In [13]:
# Choose the category: documentary (0), drama (1), anything else means movie.
category = 2
category_dirs = {0: '/Documentary', 1: '/Drama'}
script_path_cat = script_path + category_dirs.get(category, '/Movie')

# Keep every entry of the category folder except hidden ones (leading dot).
file_list = sorted(
    entry for entry in os.listdir(script_path_cat) if not entry.startswith('.')
)
print(f'작품개수: {len(file_list)}')

# Print the numbered titles, breaking the line after every fifth one.
for index, title in enumerate(file_list):
    print(f'[{index}: {title}]', end=', ')
    if (index + 1) % 5 == 0:
        print()

작품개수: 41
[0: About.Time], [1: Arthur.Christmas], [2: Baby.Driver], [3: Bad.Boys], [4: Bee.Movie], 
[5: Ben.Platt.Live.from.Radio.City.Music.Hall], [6: Constantine], [7: Dolittle], [8: Dracula.Untold], [9: Enola.Holmes], 
[10: How.to.Be.Single], [11: In.Time], [12: Inception], [13: Interstellar], [14: It_s.Complicated], 
[15: Jumanji_.Welcome.to.the.Jungle], [16: Kill.Bill_.Vol..1], [17: Klaus], [18: Kung.Fu.Panda], [19: Kung.Fu.Panda.2], 
[20: Kung.Fu.Panda.3], [21: Madagascar_.Escape.2.Africa], [22: Matilda], [23: Monty.Python.and.the.Holy.Grail], [24: Morning.Glory], 
[25: Notting.Hill], [26: Passengers], [27: Penguins.of.Madagascar_.The.Movie], [28: Prisoners], [29: Second.Act], 
[30: The.Accountant], [31: The.Boy.in.the.Striped.Pajamas], [32: The.Dark.Knight], [33: The.Departed], [34: The.Family.Man], 
[35: The.Intern], [36: The.Judge], [37: The.Revenant], [38: The.Truman.Show], [39: There.Will.Be.Blood], 
[40: Zodiac], 

### 작품 선택

In [14]:
# Pick the title to process by its position in the file_list printed above.
title = file_list[0]

# Only a notice: a title that is already in df_result is still processed below.
already_counted = title in df_result.index
if already_counted:
    print(f'{title} is already exists')

file_name = 'unique_words_' + title + '.WEBRip.Netflix.en[cc]'
print(f'{script_path_cat}/{title}/{file_name}')

../../script/Movie/About.Time/unique_words_About.Time.WEBRip.Netflix.en[cc]


## unique_words_작품.csv to DataFrame

In [15]:
# Load the per-title unique-word counts, indexed by the 'word' column.
# NOTE(review): with read_csv's default NA handling, words such as 'null' or
# 'nan' may be parsed as missing index labels — verify against the data and
# consider keep_default_na=False if that is the case.
counts_df = pd.read_csv(f'{script_path_cat}/{title}/{file_name}.csv', index_col='word')

# DataFrame.info() prints its summary and returns None, so it is called on its
# own line; passing its return value to display() only rendered a stray "None".
counts_df.info()
display(counts_df)

<class 'pandas.core.frame.DataFrame'>
Index: 1304 entries, just to zip
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   counts  1304 non-null   int64
dtypes: int64(1)
memory usage: 20.4+ KB


None

Unnamed: 0_level_0,counts
word,Unnamed: 1_level_1
just,66
so,62
very,44
know,43
think,38
...,...
impeccably,1
impressive,1
incredibly,1
indifference,1


## 변형형을 원형으로 변환 후 다시 세기

In [5]:
# Load the word list; floats are displayed without decimals from here on.
pd.options.display.float_format = '{:.0f}'.format

word_db_df = pd.read_csv(f'{word_path}/words_df_word_level_final.csv').set_index('Word')

# Load the inflected-form list and build a lookup from each inflected form to
# its head word. Every 'Lemmas' cell is a ';'-separated string; when a form is
# listed under several head words, the last one read wins.
existing_lemmas = pd.read_csv(f'{word_path}/existing_lemmas_final.csv', index_col='Word')
lemmas_dict = {
    lemma: str(head_word)
    for head_word, lemma_field in zip(existing_lemmas.index, existing_lemmas['Lemmas'])
    for lemma in lemma_field.split(';')
}

def convertToHeadForm(word):
    """Return the head form of ``word`` for lookup in the word list.

    The input is first converted with ``str()``. Resolution order:

    1. a (truthy) entry in ``lemmas_dict`` for the word;
    2. the word itself when it is a label of ``word_db_df.index``;
    3. for words ending in 'ly', the first of these candidates that is a label
       of ``word_db_df.index``: the word without 'ly'; for '...ily' the word
       with 'ily' replaced by 'y'; for '...ally' the word without 'ally';
       the word with 'ly' replaced by 'e'.

    A candidate chosen in step 3 is also printed. When nothing matches, the
    (stringified) word is returned unchanged.
    """
    word = str(word)

    mapped_head = lemmas_dict.get(word)
    if mapped_head:
        return mapped_head

    known_words = word_db_df.index
    if word in known_words or not word.endswith('ly'):
        return word

    # Candidate head forms, in the order they must be tried.
    stem = word[:-2]
    candidates = [stem]
    if word.endswith('ily'):
        candidates.append(word[:-3] + 'y')
    if word.endswith('ally'):
        candidates.append(word[:-4])
    candidates.append(stem + 'e')

    for candidate in candidates:
        if candidate in known_words:
            print(candidate)
            return candidate
    return word

# Relabel every word with its head form, then add up the counts of words that
# now share a label and order the result from most to least frequent.
counts_df.index = counts_df.index.map(convertToHeadForm)

counts_df_headed = (
    counts_df.groupby(counts_df.index)
    .sum()
    .sort_values('counts', ascending=False)
)

counts_df_headed
# counts_df_headed.to_csv('test-about-time.csv')

NameError: name 'counts_df' is not defined

## word_level 컬럼 추가

In [81]:
# Attach the word list's 'word_level' column to each counted head word,
# matching on the index (DataFrame.join defaults to a left join).
word_levels = word_db_df[['word_level']]
counts_df_headed_joined = counts_df_headed.join(word_levels)

counts_df_headed_joined

Unnamed: 0_level_0,counts,word_level
word,Unnamed: 1_level_1,Unnamed: 2_level_1
just,66,1
so,62,1
can,62,1
think,54,1
good,54,1
...,...,...
illustrious,1,4
imagination,1,2
imagine,1,1
immediate,1,2


## Group by word_level

In [82]:
# Count how many head words fall into each word_level and reshape the counts
# into a single row labelled with the current title.
level_names = {1: 'level_1', 2: 'level_2', 3: 'level_3', 4: 'level_4', 5: 'level_5', 6: 'level_6'}
result = (
    counts_df_headed_joined.value_counts('word_level')
    .to_frame()
    .transpose()
    .assign(title=title)
    .set_index('title')
    .rename(columns=level_names)
)
result

word_level,level_1,level_2,level_3,level_4,level_5,level_6
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
About.Time,645,201,104,59,24,13


## df_result에 추가

In [83]:
# Append this title's row to the accumulated results table.
frames = [df_result, result]
df_result = pd.concat(frames)
df_result

Unnamed: 0_level_0,level_1,level_2,level_3,level_4,level_5,level_6,running_time,WPS,calculated,final_score
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
About.Time,645,201,104,59,24,13,,,,


# 위의 코드들 합쳐서 한 번에 모든 작품 돌리기

In [21]:
import os
import pandas as pd
import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt
# import matplotlib as mpl
# import warnings
# warnings.filterwarnings('ignore')

# plt.rcParams['font.family'] = 'AppleGothic'
# plt.rcParams['font.family'] = 'NanumGothic'

# Locations of the scripts and of the word datasets.
path = '../..'
script_path = f'{path}/script'
word_path = f'{path}/words/final_datasets'

# Load an existing counts table when present; otherwise start with an empty
# table indexed by title that has one column per word level.
# NOTE(review): this reads '_word_level_counts.csv', while the save cell at the
# end of the notebook writes 'all_scripts_word_level_counts.csv' — confirm the
# different file name is intentional.
counts_csv = f'{word_path}/_word_level_counts.csv'
if not os.path.exists(counts_csv):
    empty_columns = ['title', 'level_1', 'level_2', 'level_3', 'level_4', 'level_5', 'level_6']
    df_result = pd.DataFrame(columns=empty_columns).set_index('title')
else:
    df_result = pd.read_csv(counts_csv, index_col='title')

display(df_result)

# Choose the category: documentary (0), drama (1), anything else means movie.
category = 2
category_dirs = {0: '/Documentary', 1: '/Drama'}
script_path_cat = script_path + category_dirs.get(category, '/Movie')

# Keep every entry of the category folder except hidden ones (leading dot).
file_list = sorted(
    entry for entry in os.listdir(script_path_cat) if not entry.startswith('.')
)
print(f'작품개수: {len(file_list)}')

# Print the numbered titles, breaking the line after every fifth one.
for index, title in enumerate(file_list):
    print(f'[{index}: {title}]', end=', ')
    if (index + 1) % 5 == 0:
        print()

Unnamed: 0_level_0,level_1,level_2,level_3,level_4,level_5,level_6,running_time,WPS_mean,WPS_std,WPS_min,WPS_max,calculated,final_score
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1


작품개수: 41
[0: About.Time], [1: Arthur.Christmas], [2: Baby.Driver], [3: Bad.Boys], [4: Bee.Movie], 
[5: Ben.Platt.Live.from.Radio.City.Music.Hall], [6: Constantine], [7: Dolittle], [8: Dracula.Untold], [9: Enola.Holmes], 
[10: How.to.Be.Single], [11: In.Time], [12: Inception], [13: Interstellar], [14: It_s.Complicated], 
[15: Jumanji_.Welcome.to.the.Jungle], [16: Kill.Bill_.Vol..1], [17: Klaus], [18: Kung.Fu.Panda], [19: Kung.Fu.Panda.2], 
[20: Kung.Fu.Panda.3], [21: Madagascar_.Escape.2.Africa], [22: Matilda], [23: Monty.Python.and.the.Holy.Grail], [24: Morning.Glory], 
[25: Notting.Hill], [26: Passengers], [27: Penguins.of.Madagascar_.The.Movie], [28: Prisoners], [29: Second.Act], 
[30: The.Accountant], [31: The.Boy.in.the.Striped.Pajamas], [32: The.Dark.Knight], [33: The.Departed], [34: The.Family.Man], 
[35: The.Intern], [36: The.Judge], [37: The.Revenant], [38: The.Truman.Show], [39: There.Will.Be.Blood], 
[40: Zodiac], 

In [91]:
# Show floats without decimals when DataFrames are displayed.
pd.options.display.float_format = '{:.0f}'.format

# The word list and the inflected-form list are the same for every title, so
# they are loaded once here instead of being re-read on each loop iteration.
word_db_df = pd.read_csv(f'{word_path}/words_df_word_level_final.csv')
word_db_df.set_index('Word', inplace=True)

# Lookup from each inflected form to its head word. Every 'Lemmas' cell is a
# ';'-separated string; a form listed under several head words keeps the last.
existing_lemmas = pd.read_csv(f'{word_path}/existing_lemmas_final.csv', index_col='Word')
lemmas_dict = {}
for index, row in existing_lemmas.iterrows():
    for lemma in row["Lemmas"].split(';'):
        lemmas_dict[lemma] = str(index)


def convertToHeadForm(word):
    """Return the head form of ``word`` for lookup in the word list.

    The input is first converted with ``str()``. Resolution order:

    1. a (truthy) entry in ``lemmas_dict`` for the word;
    2. the word itself when it is a label of ``word_db_df.index``;
    3. for words ending in 'ly', the first of these candidates that is a label
       of ``word_db_df.index``: the word without 'ly'; for '...ily' the word
       with 'ily' replaced by 'y'; for '...ally' the word without 'ally';
       the word with 'ly' replaced by 'e'.

    When nothing matches, the (stringified) word is returned unchanged.
    """
    word = str(word)
    head = lemmas_dict.get(word)
    if head:
        return head

    known_words = word_db_df.index
    if word in known_words or not word.endswith('ly'):
        return word

    # Candidate head forms, in the order they must be tried.
    candidates = [word[:-2]]
    if word.endswith('ily'):
        candidates.append(word[:-3] + 'y')
    if word.endswith('ally'):
        candidates.append(word[:-4])
    candidates.append(word[:-2] + 'e')

    for candidate in candidates:
        if candidate in known_words:
            return candidate
    return word


level_columns = {1: 'level_1', 2: 'level_2', 3: 'level_3', 4: 'level_4', 5: 'level_5', 6: 'level_6'}

# Titles already present in df_result are skipped; new rows are collected and
# concatenated once after the loop rather than re-copying the table each time.
known_titles = set(df_result.index)
new_rows = []
for title in file_list:
    if title in known_titles:
        continue

    file_name = 'unique_words_' + title + '.WEBRip.Netflix.en[cc]'
    counts_df = pd.read_csv(f'{script_path_cat}/{title}/{file_name}.csv', index_col='word')

    # Relabel words with their head forms and add up counts that now coincide.
    counts_df.index = counts_df.index.map(convertToHeadForm)
    counts_df_headed = counts_df.groupby(counts_df.index).sum()
    counts_df_headed.sort_values('counts', ascending=False, inplace=True)

    # Attach each head word's word_level, then count head words per level.
    counts_df_headed_joined = counts_df_headed.join(word_db_df[['word_level']])

    result = counts_df_headed_joined.value_counts('word_level').to_frame().transpose()
    result['title'] = title
    result = result.set_index('title')
    result = result.rename(columns=level_columns)

    new_rows.append(result)
    known_titles.add(title)

df_result = pd.concat([df_result, *new_rows])

df_result

Unnamed: 0_level_0,level_1,level_2,level_3,level_4,level_5,level_6,running_time,WPS,calculated,final_score
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
About.Time,645,201,104,59,24,13,,,,
Arthur.Christmas,564,175,104,54,40,28,,,,
Baby.Driver,604,165,122,46,23,24,,,,
Bad.Boys,613,197,108,41,35,29,,,,
Bee.Movie,654,230,156,70,28,43,,,,
Ben.Platt.Live.from.Radio.City.Music.Hall,355,67,37,18,9,7,,,,
Constantine,498,143,89,40,18,16,,,,
Dolittle,586,217,109,55,25,24,,,,
Dracula.Untold,371,124,65,37,6,2,,,,
Enola.Holmes,627,223,117,51,26,21,,,,


## csv파일로 변환

In [1]:
# Order the results by title and write them to the shared results CSV.
output_csv = f'{word_path}/all_scripts_word_level_counts.csv'
df_result = df_result.sort_index()
df_result.to_csv(output_csv)

NameError: name 'df_result' is not defined