## 라이브러리

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import warnings
warnings.filterwarnings('ignore')

plt.rcParams['font.family'] = 'AppleGothic'
# plt.rcParams['font.family'] = 'NanumGothic'

## path 설정

In [2]:
path = '../..'
script_path = f'{path}/script'
word_path = f'{path}/words/final_datasets'

## 결과 데이터프레임 생성

In [3]:
if os.path.exists(f'{script_path}/csv_files/all_scripts_word_level_counts.csv'):
    df_result = pd.read_csv(f'{script_path}/csv_files/all_scripts_word_level_counts.csv', index_col='title')
else:
    df_result = pd.DataFrame(columns=['title', 'level_1', 'level_2', 'level_3', 'level_4', 'level_5', 'level_6']).set_index('title')

df_result

Unnamed: 0_level_0,level_1,level_2,level_3,level_4,level_5,level_6
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
13TH,3732,833,606,115,44,44
About.Time,3220,317,257,47,27,14
Arthur.Christmas,2293,228,240,106,49,39
Baby.Driver,2849,246,351,77,21,43
Bad.Boys,3707,344,392,94,57,40
Bee.Movie,2940,421,379,76,30,62
Ben.Platt.Live.from.Radio.City.Music.Hall,1157,76,78,12,7,14
Bo.Burnham_.Inside,983,117,70,11,5,8
Bo.Burnham_.Make.Happy,1165,119,137,22,21,20
Chasing.Coral,2526,300,439,54,21,25


## 자막(단어) 파일 불러오기

### 경로 설정

In [30]:
# 다큐멘터리(0), 드라마(1), 영화(2) 선택
category = 1
if category == 0:
    script_path_cat = script_path + '/Documentary'
elif category == 1:
    script_path_cat = script_path + '/Drama'
else:
    script_path_cat = script_path + '/Movie'

_list = os.listdir(script_path_cat)

file_list = []
for file in _list:
    if file.startswith('.'): continue
    file_list.append(file)

file_list = sorted(file_list)
print(f'작품개수: {len(file_list)}')
count = 0
for index, title in enumerate(file_list):
    print(f'[{index}: {title}]', end=', ')
    count += 1
    if count%5 == 0:
        print()

작품개수: 14
[0: Anne.with.an.E], [1: Better.Call.Saul], [2: Black.Mirror], [3: Breaking.Bad], [4: Disenchantment], 
[5: Friends], [6: How.to.Get.Away.With.Murder], [7: MINDHUNTER], [8: Narcos], [9: Peaky.Blinders], 
[10: Rick.and.Morty], [11: Sherlock], [12: Stranger.Things], [13: The.Good.Doctor], 

### 작품 선택

In [31]:
# 바로 위 코드에서 file_list를 참고하여 작품 선택
title = file_list[13]
# if (df_result.index == title).any(): print(f'{title} is already exists')
file_name = 'unique_words_' + title + '.WEBRip.Netflix.en[cc]'
print(f'{script_path_cat}/{title}/{file_name}')

../../script/Drama/The.Good.Doctor/unique_words_The.Good.Doctor.WEBRip.Netflix.en[cc]


## unique_words_작품.csv to DataFrame

In [32]:
counts_df = pd.read_csv(f'{script_path_cat}/{title}/{file_name}.csv', index_col='word')
display(counts_df.info(), counts_df)

<class 'pandas.core.frame.DataFrame'>
Index: 3811 entries, know to zoo
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   counts  3811 non-null   int64
dtypes: int64(1)
memory usage: 59.5+ KB


None

Unnamed: 0_level_0,counts
word,Unnamed: 1_level_1
know,223
can,195
just,185
need,146
gon,133
...,...
pack,1
dose,1
doubt,1
doubting,1


## 변형형을 원형으로 변환 후 다시 세기

In [33]:
# 단어 리스트 불러오기
pd.options.display.float_format = '{:.0f}'.format

word_db_df = pd.read_csv(f'{word_path}/words_df_word_level_final.csv')
word_db_df
word_db_df.set_index('Word', inplace=True)

#변형어 리스트 불러오기
existing_lemmas = pd.read_csv(f'{word_path}/existing_lemmas_final.csv', index_col='Word')
lemmas_dict = {}
for index, row in existing_lemmas.iterrows():
    lemmas = row["Lemmas"].split(';')
    for lemma in lemmas:
        lemmas_dict[lemma] = str(index)

def convertToHeadForm(word):
    word = str(word)
    head = lemmas_dict.get(word)
    # If a lemma word, change to head word in word list
    if head:
        return head
    if word in word_db_df.index:
        return word
    else:
        # Check if adverb
        if word.endswith('ly'):
            if word[:-2] in word_db_df.index:
                print(word[:-2])
                return word[:-2]
            if word.endswith('ily'):
                if (word[:-3] + 'y') in word_db_df.index:
                    print(word[:-3] + 'y')
                    return word[:-3] + 'y'
            if word.endswith('ally'):
                if (word[:-4]) in word_db_df.index:
                    print(word[:-4])
                    return word[:-4]
            if (word[:-2] + 'e') in word_db_df.index:
                print(word[:-2] + 'e')
                return word[:-2] + 'e'
        return word

counts_df.index = counts_df.index.map(convertToHeadForm)
counts_df

counts_df_headed = counts_df.groupby(counts_df.index).sum()
counts_df_headed.sort_values('counts', ascending=False, inplace=True)

counts_df_headed
# counts_df_headed.to_csv('test-about-time.csv')

operative


Unnamed: 0_level_0,counts
word,Unnamed: 1_level_1
can,395
know,248
go,199
need,192
just,185
...,...
phrase,1
photographic,1
photogenic,1
phony,1


## word_level 컬럼 추가

In [34]:
counts_df_headed_joined = counts_df_headed.join(word_db_df[['word_level']])

counts_df_headed_joined

Unnamed: 0_level_0,counts,word_level
word,Unnamed: 1_level_1,Unnamed: 2_level_1
can,395,1
know,248,1
go,199,1
need,192,1
just,185,1
...,...,...
phrase,1,1
photographic,1,3
photogenic,1,6
phony,1,4


## Group by word_level

In [67]:
# # import nltk
# # nltk.download('wordnet')
# # nltk.download('words')
# from nltk.corpus import wordnet, words

# test = counts_df_headed_joined[counts_df_headed_joined['word_level'].isnull()]
# test = test.reset_index()
# test['synsets'] = test['word'].apply(lambda x: True if len(wordnet.synsets(x))>0 else False)
# # test['words.words'] = test['word'].apply(lambda x: True if x in words.words else False)
# test.head()
    

Unnamed: 0,word,counts,word_level,synsets
0,gon,133,,False
1,scut,8,,True
2,jared,8,,True
3,sats,7,,True
4,meds,7,,True


In [68]:
# test['words_words'] = test['word'].apply(lambda x: True if x in words.words() else False)
# test.head()

Unnamed: 0,word,counts,word_level,synsets,words_words
0,gon,133,,False,True
1,scut,8,,True,True
2,jared,8,,True,False
3,sats,7,,True,False
4,meds,7,,True,False


In [9]:
result = counts_df_headed_joined.groupby('word_level').sum('counts').transpose()
result['title'] = title
result = result.set_index('title')
result = result.rename(columns={1: 'level_1', 2: 'level_2', 3: 'level_3', 4: 'level_4', 5: 'level_5', 6: 'level_6'})
result

word_level,level_1,level_2,level_3,level_4,level_5,level_6
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
About.Time,3220,317,257,47,27,14


## df_result에 추가

In [10]:
df_result = pd.concat([df_result, result])
df_result

Unnamed: 0_level_0,level_1,level_2,level_3,level_4,level_5,level_6
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
13TH,3732,833,606,115,44,44
About.Time,3220,317,257,47,27,14
Arthur.Christmas,2293,228,240,106,49,39
Baby.Driver,2849,246,351,77,21,43
Bad.Boys,3707,344,392,94,57,40
Bee.Movie,2940,421,379,76,30,62
Ben.Platt.Live.from.Radio.City.Music.Hall,1157,76,78,12,7,14
Bo.Burnham_.Inside,983,117,70,11,5,8
Bo.Burnham_.Make.Happy,1165,119,137,22,21,20
Chasing.Coral,2526,300,439,54,21,25


# 위의 코드들 합쳐서 한 번에 모든 작품 돌리기

In [21]:
import os
import pandas as pd
import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt
# import matplotlib as mpl
# import warnings
# warnings.filterwarnings('ignore')

# plt.rcParams['font.family'] = 'AppleGothic'
# plt.rcParams['font.family'] = 'NanumGothic'

path = '../..'
script_path = f'{path}/script'
word_path = f'{path}/words/final_datasets'

if os.path.exists(f'{script_path}/csv_files/all_scripts_word_level_counts.csv'):
    df_result = pd.read_csv(f'{script_path}/csv_files/all_scripts_word_level_counts.csv', index_col='title')
else:
    df_result = pd.DataFrame(columns=['title', 'level_1', 'level_2', 'level_3', 'level_4', 'level_5', 'level_6']).set_index('title')

display(df_result)

# 다큐멘터리(0), 드라마(1), 영화(2) 선택
category = 2
if category == 0:
    script_path_cat = script_path + '/Documentary'
elif category == 1:
    script_path_cat = script_path + '/Drama'
else:
    script_path_cat = script_path + '/Movie'

_list = os.listdir(script_path_cat)

file_list = []
for file in _list:
    if file.startswith('.'): continue
    file_list.append(file)

file_list = sorted(file_list)
print(f'작품개수: {len(file_list)}')
count = 0
for index, title in enumerate(file_list):
    print(f'[{index}: {title}]', end=', ')
    count += 1
    if count%5 == 0:
        print()

Unnamed: 0_level_0,level_1,level_2,level_3,level_4,level_5,level_6,running_time,WPS_mean,WPS_std,WPS_min,WPS_max,calculated,final_score
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1


작품개수: 41
[0: About.Time], [1: Arthur.Christmas], [2: Baby.Driver], [3: Bad.Boys], [4: Bee.Movie], 
[5: Ben.Platt.Live.from.Radio.City.Music.Hall], [6: Constantine], [7: Dolittle], [8: Dracula.Untold], [9: Enola.Holmes], 
[10: How.to.Be.Single], [11: In.Time], [12: Inception], [13: Interstellar], [14: It_s.Complicated], 
[15: Jumanji_.Welcome.to.the.Jungle], [16: Kill.Bill_.Vol..1], [17: Klaus], [18: Kung.Fu.Panda], [19: Kung.Fu.Panda.2], 
[20: Kung.Fu.Panda.3], [21: Madagascar_.Escape.2.Africa], [22: Matilda], [23: Monty.Python.and.the.Holy.Grail], [24: Morning.Glory], 
[25: Notting.Hill], [26: Passengers], [27: Penguins.of.Madagascar_.The.Movie], [28: Prisoners], [29: Second.Act], 
[30: The.Accountant], [31: The.Boy.in.the.Striped.Pajamas], [32: The.Dark.Knight], [33: The.Departed], [34: The.Family.Man], 
[35: The.Intern], [36: The.Judge], [37: The.Revenant], [38: The.Truman.Show], [39: There.Will.Be.Blood], 
[40: Zodiac], 

In [91]:
for i in range(len(file_list)):
    # 바로 위 코드에서 file_list를 참고하여 작품 선택
    title = file_list[i]
    if (df_result.index == title).any(): continue

    file_name = 'unique_words_' + title + '.WEBRip.Netflix.en[cc]'
    # print(f'{script_path_cat}/{title}/{file_name}')

    counts_df = pd.read_csv(f'{script_path_cat}/{title}/{file_name}.csv', index_col='word')

    # 단어 리스트 불러오기
    pd.options.display.float_format = '{:.0f}'.format

    word_db_df = pd.read_csv(f'{word_path}/words_df_word_level_final.csv')
    word_db_df.set_index('Word', inplace=True)

    #변형어 리스트 불러오기
    existing_lemmas = pd.read_csv(f'{word_path}/existing_lemmas_final.csv', index_col='Word')
    lemmas_dict = {}
    for index, row in existing_lemmas.iterrows():
        lemmas = row["Lemmas"].split(';')
        for lemma in lemmas:
            lemmas_dict[lemma] = str(index)

    def convertToHeadForm(word):
        word = str(word)
        head = lemmas_dict.get(word)
        # If a lemma word, change to head word in word list
        if head:
            return head
        if word in word_db_df.index:
            return word
        else:
            # Check if adverb
            if word.endswith('ly'):
                if word[:-2] in word_db_df.index:
                    # print(word[:-2])
                    return word[:-2]
                if word.endswith('ily'):
                    if (word[:-3] + 'y') in word_db_df.index:
                        # print(word[:-3] + 'y')
                        return word[:-3] + 'y'
                if word.endswith('ally'):
                    if (word[:-4]) in word_db_df.index:
                        # print(word[:-4])
                        return word[:-4]
                if (word[:-2] + 'e') in word_db_df.index:
                    # print(word[:-2] + 'e')
                    return word[:-2] + 'e'
            return word

    counts_df.index = counts_df.index.map(convertToHeadForm)

    counts_df_headed = counts_df.groupby(counts_df.index).sum()
    counts_df_headed.sort_values('counts', ascending=False, inplace=True)

    counts_df_headed_joined = counts_df_headed.join(word_db_df[['word_level']])

    result = counts_df_headed_joined.groupby('word_level').sum('counts').transpose()
    result['title'] = title
    result = result.set_index('title')
    result = result.rename(columns={1: 'level_1', 2: 'level_2', 3: 'level_3', 4: 'level_4', 5: 'level_5', 6: 'level_6'})

    df_result = pd.concat([df_result, result])

df_result

Unnamed: 0_level_0,level_1,level_2,level_3,level_4,level_5,level_6,running_time,WPS,calculated,final_score
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
About.Time,645,201,104,59,24,13,,,,
Arthur.Christmas,564,175,104,54,40,28,,,,
Baby.Driver,604,165,122,46,23,24,,,,
Bad.Boys,613,197,108,41,35,29,,,,
Bee.Movie,654,230,156,70,28,43,,,,
Ben.Platt.Live.from.Radio.City.Music.Hall,355,67,37,18,9,7,,,,
Constantine,498,143,89,40,18,16,,,,
Dolittle,586,217,109,55,25,24,,,,
Dracula.Untold,371,124,65,37,6,2,,,,
Enola.Holmes,627,223,117,51,26,21,,,,


## csv파일로 변환

In [1]:
df_result = df_result.sort_index()
df_result.to_csv(f'{script_path}/csv_files/all_scripts_word_level_counts.csv')

NameError: name 'df_result' is not defined