# Detecting the English level of a movie by it's subtitles

### Research goals 
- Practising on a real textual data
- Discover basics of the NLP
- Sharpen building prediction models skill

### Research progress
- Data acquisition and initial analysis
- Check for duplicates and gaps, correct where it's possible
- Check data for anomalies
- Correlation analysis
- Data preprocessing and cleaning
- Train and asses models

## Data acquisition and initial analysis

In [1]:
import pandas as pd
import numpy as np
import chardet  as cdt
import os
import pysrt
import nltk
import pickle

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

RANDOM_VAL = 12345
subs_path = './data/Subtitles_all'

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/artem.pochechuev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/artem.pochechuev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/artem.pochechuev/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df_dict = pd.read_csv('data/oxford_cefr.csv', encoding = "latin")
df_dict = df_dict.drop_duplicates()
display(df_dict)
print(df_dict['level'].value_counts())
df_dict = dict(df_dict.values)

Unnamed: 0,word,level
0,abandon,b2
1,ability,a2
2,able,a2
3,abolish,c1
4,abortion,c1
...,...,...
5328,yours,a2
5329,yourself,a1
5330,youth,b1
5331,zero,a2


b2    1425
c1    1312
a1     891
a2     865
b1     804
Name: level, dtype: int64


In [3]:
df = pd.read_excel('data/movies_labels.xlsx')
df.drop('id', axis=1, inplace=True)
df = df.rename(columns=str.lower)
display(df)
print(df['level'].value_counts())

Unnamed: 0,movie,level
0,10_Cloverfield_lane(2016),B1
1,10_things_I_hate_about_you(1999),B1
2,A_knights_tale(2001),B2
3,A_star_is_born(2018),B2
4,Aladdin(1992),A2/A2+
...,...,...
236,Matilda(2022),C1
237,Bullet train,B1
238,Thor: love and thunder,B2
239,Lightyear,B2


B2            101
B1             55
C1             40
A2/A2+         26
B1, B2          8
A2              6
A2/A2+, B1      5
Name: level, dtype: int64


In [4]:
df['path'] = subs_path + '/Subtitles/' + df['movie'] + '.srt'

for dir in os.listdir(subs_path):
    if not dir.__contains__('.DS_Store') and not dir.__contains__('Subtitles'):
        movies_arr = os.listdir(subs_path+'/'+dir)
        rows = pd.DataFrame({'movie' : movies_arr, 'level': np.full(len(movies_arr), dir), 'path': [f"{subs_path}/{dir}/{mvi}" for mvi in movies_arr]})
        df = pd.concat([df, rows], ignore_index = True)
        df.reset_index(drop=True, inplace=True)

display(df)

Unnamed: 0,movie,level,path
0,10_Cloverfield_lane(2016),B1,./data/Subtitles_all/Subtitles/10_Cloverfield_...
1,10_things_I_hate_about_you(1999),B1,./data/Subtitles_all/Subtitles/10_things_I_hat...
2,A_knights_tale(2001),B2,./data/Subtitles_all/Subtitles/A_knights_tale(...
3,A_star_is_born(2018),B2,./data/Subtitles_all/Subtitles/A_star_is_born(...
4,Aladdin(1992),A2/A2+,./data/Subtitles_all/Subtitles/Aladdin(1992).srt
...,...,...,...
399,Indiana Jones And The Last Crusade DVDRip Xvid...,B1,./data/Subtitles_all/B1/Indiana Jones And The ...
400,Seven.Worlds.One.Planet.S01E04.2160p.BluRay.Re...,B1,./data/Subtitles_all/B1/Seven.Worlds.One.Plane...
401,Seven.Worlds.One.Planet.S01E03.2160p.BluRay.Re...,B1,./data/Subtitles_all/B1/Seven.Worlds.One.Plane...
402,Angelas.Christmas.Wish.2020.srt,B1,./data/Subtitles_all/B1/Angelas.Christmas.Wish...


In [5]:
df.drop_duplicates(inplace=True)

df.loc[df['level'] == 'A2/A2+', 'level'] = 'A2'
df.loc[df['level'] == 'B1, B2', 'level'] = 'B2'
df.loc[df['level'] == 'A2/A2+, B1', 'level'] = 'B1'

display(df['level'].value_counts())

B2    216
B1     75
C1     73
A2     38
Name: level, dtype: int64

In [6]:
def proc_text(file_path):
    result_words = []

    with open(file_path, 'rb') as sub_file:
        file_content = sub_file.read()

    encoding = cdt.detect(file_content).get('encoding')

    subs = pysrt.open(file_path, encoding)
    lemmatizer = WordNetLemmatizer()

    for sub in subs:
        result_text = BeautifulSoup(sub.text.lower(), "lxml").text
        stop_words = set(stopwords.words('english'))
        words = word_tokenize(result_text)
        words = [word for word in words if word not in stop_words]
        words = [word for word in words if word.isalpha()]
        result_words.extend([lemmatizer.lemmatize(word) for word in words])

    return result_words

In [7]:
def get_sub_data(file_path, level, name, columns):
    if not os.path.exists(file_path):
        raise Exception("Subtitile does not exist")

    res_stat = dict.fromkeys(columns, 0)
    res_stat['name'] = name
    res_stat['level'] = level
    
    result_words = proc_text(file_path)

    for word in result_words:
        if word not in df_dict:
            res_stat['unknown'] += 1
        else:
            res_stat[df_dict.get(word)] += 1

    res_stat['text'] = ' '.join(result_words)

    if(os.path.exists('dump.txt')):
        os.remove('dump.txt')

    with open('dump.txt', 'a') as f:
       f.write(f'subtitles from: {file_path}\n')
       f.write(res_stat['text'])
       f.write('\n\n')

    return res_stat

In [8]:
columns = list(set(df_dict.values()))
columns.extend(['unknown', 'text', 'encoding', 'name'])

rows_list = []

for idx, row in df.iterrows():
    if os.path.exists(row['path']):
        rows_list.append(get_sub_data(df.at[idx, 'path'], df.at[idx, 'level'], df.at[idx, 'movie'], columns=columns))

rows_res_df = pd.DataFrame(rows_list)
display(rows_res_df)

  result_text = BeautifulSoup(sub.text.lower(), "lxml").text


Unnamed: 0,b1,b2,a2,c1,a1,unknown,text,encoding,name,level
0,229,185,295,75,802,1038,fixed synced bozxphd enjoy flick clanging draw...,0,10_Cloverfield_lane(2016),B1
1,348,283,444,126,1378,1082,hey right cameron go nine school year army bra...,0,10_things_I_hate_about_you(1999),B1
2,304,327,465,162,1069,1121,resync xenzai nef retail help due list two min...,0,A_knights_tale(2001),B2
3,449,430,696,162,2542,1597,synced corrected mrcjnthn get black eye open w...,0,A_star_is_born(2018),B2
4,474,376,598,147,1196,1469,oh come land faraway place caravan camel roam ...,0,Aladdin(1992),A2
...,...,...,...,...,...,...,...,...,...,...
266,232,311,424,103,1117,1234,dismount herman chap one wander passageway run...,0,Indiana Jones And The Last Crusade DVDRip Xvid...,B1
267,153,136,211,64,398,407,australia island continent cast adrift time di...,0,Seven.Worlds.One.Planet.S01E04.2160p.BluRay.Re...,B1
268,155,132,212,47,340,305,southern tip south america andes mountain rise...,0,Seven.Worlds.One.Planet.S01E03.2160p.BluRay.Re...,B1
269,92,76,202,20,563,427,going come angela mind sheep dada want hear so...,0,Angelas.Christmas.Wish.2020.srt,B1


In [9]:
models2train = [
    LinearSVC(max_iter=3000, random_state=RANDOM_VAL),
    CatBoostClassifier(random_state=RANDOM_VAL, silent=True),
    RandomForestClassifier(n_estimators=15, random_state=RANDOM_VAL),
    LogisticRegression(random_state=RANDOM_VAL, solver='lbfgs', max_iter=10000),
    RidgeClassifier(random_state=RANDOM_VAL, max_iter=10000),
    SGDClassifier(random_state=RANDOM_VAL, max_iter=10000)
]

In [10]:
def print_metrics(header, ppln, X_train, X_test, Y_train, Y_test):
    ppln.fit(X_train, Y_train)
    predicted = ppln.predict(X_test)
    
    print(f'{header} results:')    
    print('confusion_matrix:\n', confusion_matrix(Y_test, predicted))
    print('classification_report:\n', classification_report(Y_test, predicted))
    print('accuracy_score:', accuracy_score(Y_test, predicted))
    print('f1_score_macro:', f1_score(Y_test, predicted, average='macro'), 'f1_score_micro:', f1_score(Y_test, predicted, average='micro'))
    print()

## Method 1 - predicting CEFR level by number of leveled words in text

In [11]:
features_leveled = rows_res_df.drop(['level', 'text', 'encoding', 'name'], axis=1)
target = rows_res_df['level']

features_train_lvl, features_test_lvl, target_train_lvl, target_test_lvl = train_test_split(features_leveled, target, train_size=0.2, random_state=RANDOM_VAL)

In [12]:
scaler = StandardScaler()
scaler.fit(features_train_lvl)

features_train_lvl = scaler.transform(features_train_lvl)
features_test_lvl = scaler.transform(features_test_lvl)
print("=========== CEFR level by number of leveled words in text ===========")

for model in models2train:
    print_metrics(type(model).__name__, model, features_train_lvl, features_test_lvl, target_train_lvl, target_test_lvl)

LinearSVC results:
confusion_matrix:
 [[ 2 12 11  0]
 [ 4 20 16  0]
 [ 8 28 85  0]
 [ 1  5 25  0]]
classification_report:
               precision    recall  f1-score   support

          A2       0.13      0.08      0.10        25
          B1       0.31      0.50      0.38        40
          B2       0.62      0.70      0.66       121
          C1       0.00      0.00      0.00        31

    accuracy                           0.49       217
   macro avg       0.27      0.32      0.28       217
weighted avg       0.42      0.49      0.45       217

accuracy_score: 0.4930875576036866
f1_score_macro: 0.28496677740863785 f1_score_micro: 0.4930875576036866



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


CatBoostClassifier results:
confusion_matrix:
 [[ 1 16  6  2]
 [ 4 18 17  1]
 [ 2 32 68 19]
 [ 2  1 16 12]]
classification_report:
               precision    recall  f1-score   support

          A2       0.11      0.04      0.06        25
          B1       0.27      0.45      0.34        40
          B2       0.64      0.56      0.60       121
          C1       0.35      0.39      0.37        31

    accuracy                           0.46       217
   macro avg       0.34      0.36      0.34       217
weighted avg       0.47      0.46      0.45       217

accuracy_score: 0.45622119815668205
f1_score_macro: 0.3402485312108876 f1_score_micro: 0.45622119815668205

RandomForestClassifier results:
confusion_matrix:
 [[ 5 11  8  1]
 [ 5 16 18  1]
 [ 5 30 64 22]
 [ 2  2 15 12]]
classification_report:
               precision    recall  f1-score   support

          A2       0.29      0.20      0.24        25
          B1       0.27      0.40      0.32        40
          B2       0.61   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Method 2 - predicting CEFR level by text embedings

In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(rows_res_df['text'], target, train_size=0.2, random_state=RANDOM_VAL, stratify=target)
vectorizer = TfidfVectorizer(max_features=1500, min_df=4, max_df=0.38)

In [14]:
for model in models2train:
    ppln = Pipeline([
        ('vectorizer', vectorizer),
        ('model', model)
    ])
    print_metrics(type(model).__name__, ppln, X_train, X_test, Y_train, Y_test)

LinearSVC results:
confusion_matrix:
 [[ 4 12  9  0]
 [ 3 21 20  2]
 [ 7 12 94  2]
 [ 0  1 26  4]]
classification_report:
               precision    recall  f1-score   support

          A2       0.29      0.16      0.21        25
          B1       0.46      0.46      0.46        46
          B2       0.63      0.82      0.71       115
          C1       0.50      0.13      0.21        31

    accuracy                           0.57       217
   macro avg       0.47      0.39      0.39       217
weighted avg       0.54      0.57      0.53       217

accuracy_score: 0.5668202764976958
f1_score_macro: 0.3947248403770143 f1_score_micro: 0.5668202764976958

CatBoostClassifier results:
confusion_matrix:
 [[  0   2  23   0]
 [  0  13  33   0]
 [  1   5 109   0]
 [  0   1  30   0]]
classification_report:
               precision    recall  f1-score   support

          A2       0.00      0.00      0.00        25
          B1       0.62      0.28      0.39        46
          B2       0.56  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


RandomForestClassifier results:
confusion_matrix:
 [[  0   5  20   0]
 [  2  15  29   0]
 [  2   5 107   1]
 [  0   0  27   4]]
classification_report:
               precision    recall  f1-score   support

          A2       0.00      0.00      0.00        25
          B1       0.60      0.33      0.42        46
          B2       0.58      0.93      0.72       115
          C1       0.80      0.13      0.22        31

    accuracy                           0.58       217
   macro avg       0.50      0.35      0.34       217
weighted avg       0.55      0.58      0.50       217

accuracy_score: 0.5806451612903226
f1_score_macro: 0.3407195597147389 f1_score_micro: 0.5806451612903226

LogisticRegression results:
confusion_matrix:
 [[  0   0  25   0]
 [  0   2  44   0]
 [  0   1 114   0]
 [  0   0  28   3]]
classification_report:
               precision    recall  f1-score   support

          A2       0.00      0.00      0.00        25
          B1       0.67      0.04      0.08       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


RidgeClassifier results:
confusion_matrix:
 [[  3   7  15   0]
 [  3  15  28   0]
 [  3   8 104   0]
 [  0   1  26   4]]
classification_report:
               precision    recall  f1-score   support

          A2       0.33      0.12      0.18        25
          B1       0.48      0.33      0.39        46
          B2       0.60      0.90      0.72       115
          C1       1.00      0.13      0.23        31

    accuracy                           0.58       217
   macro avg       0.60      0.37      0.38       217
weighted avg       0.60      0.58      0.52       217

accuracy_score: 0.5806451612903226
f1_score_macro: 0.37921865715983366 f1_score_micro: 0.5806451612903226

SGDClassifier results:
confusion_matrix:
 [[ 4 11  9  1]
 [ 2 30 11  3]
 [ 2 18 86  9]
 [ 0  1 23  7]]
classification_report:
               precision    recall  f1-score   support

          A2       0.50      0.16      0.24        25
          B1       0.50      0.65      0.57        46
          B2       0.67

In [15]:
ppln = Pipeline([
    ('vectorizer', vectorizer),
    ('model', SGDClassifier(random_state=RANDOM_VAL, max_iter=10000))
])

ppln.fit(X_train, Y_train)

with open('models/sgd_model.pcl', 'wb') as f:
    pickle.dump(ppln, f)

In [36]:
for idx, movie in rows_res_df.iterrows():
    print('predicted:', ppln.predict([movie['text']])[0], 'real level:', movie['level'], 'name:', movie['name'])

predicted: C1 real level: B1 name: 10_Cloverfield_lane(2016)
predicted: B1 real level: B1 name: 10_things_I_hate_about_you(1999)
predicted: B2 real level: B2 name: A_knights_tale(2001)
predicted: B1 real level: B2 name: A_star_is_born(2018)
predicted: B1 real level: A2 name: Aladdin(1992)
predicted: B2 real level: A2 name: All_dogs_go_to_heaven(1989)
predicted: B1 real level: A2 name: An_American_tail(1986)
predicted: A2 real level: A2 name: Babe(1995)
predicted: B2 real level: A2 name: Back_to_the_future(1985)
predicted: B2 real level: C1 name: Banking_On_Bitcoin(2016)
predicted: B2 real level: A2 name: Batman_begins(2005)
predicted: B2 real level: B2 name: Beauty_and_the_beast(2017)
predicted: B2 real level: B2 name: Before_I_go_to_sleep(2014)
predicted: B1 real level: B2 name: Before_sunrise(1995)
predicted: B1 real level: B2 name: Before_sunset(2004)
predicted: B2 real level: B2 name: Braveheart(1995)
predicted: C1 real level: B2 name: Bridget_Jones_diary(2001)
predicted: C1 real l