In [1]:
!git clone https://github.com/wangluheng328/DS301-NLP-Project.git

Cloning into 'DS301-NLP-Project'...
remote: Enumerating objects: 24680, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 24680 (delta 3), reused 12 (delta 3), pack-reused 24668[K
Receiving objects: 100% (24680/24680), 215.95 MiB | 11.14 MiB/s, done.
Resolving deltas: 100% (8827/8827), done.
Checking out files: 100% (2581/2581), done.


In [2]:
import glob
import os

root_path = os.path.expanduser('/content/DS301-NLP-Project/scriptbase_alpha')
paths = glob.glob(root_path + '/*')

In [3]:
import pandas as pd

df = pd.DataFrame([], columns = ['title', 'text', 'imdb'])


for path in paths:
  f = open(path + '/script.txt', 'r')
  try:
    text = f.read()
    imdb_dir = path + '/processed/imdb_meta.txt'
    imdb_f = open(imdb_dir, 'r')
    imdb_text = imdb_f.readlines()
    imdb_score = float(imdb_text[1].split('\t')[1].strip())
    df = df.append(pd.Series([path.split('/')[-1], text, imdb_score], index = ['title', 'text', 'imdb']), ignore_index = True)
  except:
    print('Due to decoding issues, ommited: ' + path)

Due to decoding issues, ommited: /content/DS301-NLP-Project/scriptbase_alpha/Stranglehold (1931 film)
Due to decoding issues, ommited: /content/DS301-NLP-Project/scriptbase_alpha/From Russia with Love (film)
Due to decoding issues, ommited: /content/DS301-NLP-Project/scriptbase_alpha/Goldfinger (film)
Due to decoding issues, ommited: /content/DS301-NLP-Project/scriptbase_alpha/Boy Who Never Slept
Due to decoding issues, ommited: /content/DS301-NLP-Project/scriptbase_alpha/The Jolson Story


In [4]:
df.head()

Unnamed: 0,title,text,imdb
0,Hudson Hawk,"\n\n ""Hudson Hawk"", by Steven E. de Souza, r...",5.7
1,Willard (2003 film),WILLARD\n\n\n\n\n\nBODY { background-co...,6.2
2,Extract (film),\n\n \n EXT...,6.2
3,Beavis and Butt-head Do America,Beavis and Butt-Head Do America - by Mike Judg...,6.7
4,Solarbabies,Solarbabies\n\nTranscribed by: Sonja Kemp\n\n\...,4.7


In [5]:
import nltk
from nltk.text import Text
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

nltk.download('wordnet')
nltk.download('stopwords')
stopwords = stopwords.words('english')
stopwords.remove('not')
stopwords.append('\n')
stopwords.append('<b>')
nltk.download('punkt')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [6]:
def process_text(text):
    clean_text = text
    clean_text = clean_text.lower()
    tokens = word_tokenize(clean_text)

    clean_tokens = []

    for token in tokens:
      token = lemmatizer.lemmatize(token)
      if token not in stopwords and token.isalpha():
        clean_tokens.append(lemmatizer.lemmatize(token))
    return clean_tokens

In [7]:
processed = []
for i in tqdm(range(len(df['text']))):
  processed.append(process_text(df['text'][i]))
df['text_preprocessed'] = processed

100%|██████████| 1271/1271 [11:33<00:00,  1.83it/s]


In [8]:
from collections import Counter

count_list = []
for l in df.text_preprocessed:
  count_list += l
top_10k_count = Counter(count_list).most_common(10000)
top_10k = [i[0] for i in top_10k_count]

In [9]:
top_10k[:10]

['look', 'back', 'int', 'not', 'get', 'see', 'one', 'wa', 'know', 'like']

In [10]:
dicts = []
for processed_list in df.text_preprocessed:
  dicts.append(dict(Counter(processed_list)))
df['counts'] = dicts
df.head()

Unnamed: 0,title,text,imdb,text_preprocessed,counts
0,Hudson Hawk,"\n\n ""Hudson Hawk"", by Steven E. de Souza, r...",5.7,"[hudson, hawk, steven, de, souza, revised, dan...","{'hudson': 28, 'hawk': 663, 'steven': 2, 'de':..."
1,Willard (2003 film),WILLARD\n\n\n\n\n\nBODY { background-co...,6.2,"[willard, body, ffffff, courier, new, helvetic...","{'willard': 934, 'body': 21, 'ffffff': 1, 'cou..."
2,Extract (film),\n\n \n EXT...,6.2,"[extract, written, mike, judge, march, begin, ...","{'extract': 33, 'written': 1, 'mike': 1, 'judg..."
3,Beavis and Butt-head Do America,Beavis and Butt-Head Do America - by Mike Judg...,6.7,"[beavis, america, mike, judge, joe, stillman, ...","{'beavis': 332, 'america': 8, 'mike': 2, 'judg..."
4,Solarbabies,Solarbabies\n\nTranscribed by: Sonja Kemp\n\n\...,4.7,"[solarbabies, transcribed, sonja, kemp, warden...","{'solarbabies': 24, 'transcribed': 1, 'sonja':..."


In [11]:
appeared_in = {}
for word in top_10k:
  count = 0
  for dic in df['counts']:
    if word in dic:
      count += 1
  appeared_in[word] = count

In [12]:
import math

tfidf = []
for dic in df.counts:
  # 10k-dimesional tfidf list for the current document
  tfidf_list = []
  for word in top_10k:
    tf = dic.get(word, 0)
    idf = math.log(len(df) / appeared_in[word])
    tfidf_list.append(tf * idf)
  tfidf.append(tfidf_list)

df['tfidf'] = tfidf

In [13]:
df.head()

Unnamed: 0,title,text,imdb,text_preprocessed,counts,tfidf
0,Hudson Hawk,"\n\n ""Hudson Hawk"", by Steven E. de Souza, r...",5.7,"[hudson, hawk, steven, de, souza, revised, dan...","{'hudson': 28, 'hawk': 663, 'steven': 2, 'de':...","[0.041715862051301136, 0.0, 7.335567800218206,..."
1,Willard (2003 film),WILLARD\n\n\n\n\n\nBODY { background-co...,6.2,"[willard, body, ffffff, courier, new, helvetic...","{'willard': 934, 'body': 21, 'ffffff': 1, 'cou...","[0.07083825631353023, 0.0, 20.362524410950535,..."
2,Extract (film),\n\n \n EXT...,6.2,"[extract, written, mike, judge, march, begin, ...","{'extract': 33, 'written': 1, 'mike': 1, 'judg...","[0.09523810015485731, 0.0, 8.094419641620089, ..."
3,Beavis and Butt-head Do America,Beavis and Butt-Head Do America - by Mike Judg...,6.7,"[beavis, america, mike, judge, joe, stillman, ...","{'beavis': 332, 'america': 8, 'mike': 2, 'judg...","[0.05903188026127519, 0.0, 12.141629462430133,..."
4,Solarbabies,Solarbabies\n\nTranscribed by: Sonja Kemp\n\n\...,4.7,"[solarbabies, transcribed, sonja, kemp, warden...","{'solarbabies': 24, 'transcribed': 1, 'sonja':...","[0.03699331163039912, 0.0, 0.0, 0.0, 0.0354191..."


In [20]:
imdb_level = []
for i in df.imdb:
  if i >= 6:
    imdb_level.append(3)
  elif i >= 3:
    imdb_level.append(2)
  else:
    imdb_level.append(1)

In [45]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


X_train, X_test, y_train, y_test = train_test_split(tfidf, imdb_level, test_size = 0.3)

In [46]:
lr = LogisticRegression().fit(X_train, y_train)
preds = lr.predict(np.array(X_test))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [47]:
sum(preds == y_test)/len(preds)

0.7722513089005235

### Random Forest

### PCA

In [34]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns

In [33]:
scalar = StandardScaler()
scaled_data = pd.DataFrame(scalar.fit_transform(tfidf))

In [41]:
accuracy = []
for n_comp in range(3, 10):
  pca = PCA(n_components = 3)
  pca.fit(scaled_data)
  data_pca = pca.transform(scaled_data)
  X_train, X_test, y_train, y_test = train_test_split(tfidf, imdb_level, test_size = 0.3)
  lr = LogisticRegression().fit(X_train, y_train)
  preds = lr.predict(np.array(X_test))
  accuracy.append(sum(preds == y_test)/len(preds))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [44]:
for i in range(len(accuracy)):
  print('Number of components: ', i + 3, '; Accuracy: ', accuracy[i])

Number of components:  3 ; Accuracy:  0.7722513089005235
Number of components:  4 ; Accuracy:  0.7722513089005235
Number of components:  5 ; Accuracy:  0.7617801047120419
Number of components:  6 ; Accuracy:  0.7853403141361257
Number of components:  7 ; Accuracy:  0.7984293193717278
Number of components:  8 ; Accuracy:  0.7879581151832461
Number of components:  9 ; Accuracy:  0.7801047120418848
