In [None]:
%matplotlib inline

import pandas as pd
import time
import nltk
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from scipy import stats
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report, f1_score
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


In [None]:
# for local
# processed_directory = '../processed_data/'

# for google drive
# This block only does work when the notebook runs under Google Colab
# (detected by the 'google.colab' module being loaded); otherwise nothing
# below the imports executes.
import sys, os
if 'google.colab' in sys.modules:
    # mount google drive
    from google.colab import drive
    drive.mount('/content/gdrive')
    path_to_file = '/content/gdrive/My Drive/cs4248-project/processed_data'
    print(path_to_file)
    # move to Google Drive directory
    os.chdir(path_to_file)
    # Shell escape: print the working directory to confirm the chdir above.
    !pwd

Mounted at /content/gdrive
/content/gdrive/My Drive/cs4248-project/processed_data
/content/gdrive/My Drive/cs4248-project/processed_data


In [None]:
# Choose the processed-data directory from the runtime instead of toggling
# commented-out lines: the Drive folder under Google Colab, the repository's
# relative folder everywhere else. The check is the same one used when
# mounting Drive.
if 'google.colab' in sys.modules:
    data_path = '/content/gdrive/My Drive/cs4248-project/processed_data'
else:
    data_path = '../processed_data'

# Both splits are stored as JSON files inside data_path.
train_df = pd.read_json(os.path.join(data_path, 'train.json'))
test_df = pd.read_json(os.path.join(data_path, 'test.json'))

In [None]:
# Recode the string sentiment labels as integers: '+' -> 1, '-' -> -1.
mapping = {'+': 1, '-': -1}

# Nested dict form restricts the replacement to the 'label' column only.
label_recode = {'label': mapping}
train_df = train_df.replace(to_replace=label_recode)
test_df = test_df.replace(to_replace=label_recode)

In [None]:
# Shuffle every row (frac=1) and display the first 10 as a random spot-check.
# No random_state is passed, so the rows shown differ from run to run.
train_df.sample(frac=1).head(10)

Unnamed: 0,id,text,rating,label
855,10770,"With a well thought out cast, this movie was a...",7,1
24033,9130,It is true that some fans of Peter Sellers wor...,4,-1
17957,3662,"""La Furia del Hombre Lobo"" forms a completely ...",4,-1
4200,2530,One of the joys of picking up the recent Bela ...,8,1
20821,623,Scarecrow is set in the small American town of...,3,-1
21270,6644,"Yes, he is! ...No, not because of Pintilie lik...",1,-1
13911,11270,I have seen bad films but this took the p***. ...,1,-1
23775,889,"I've never been to Paris, but after seeing ""Pa...",3,-1
2103,11894,"A truly adorable heroine who, at turns, is sur...",10,1
5716,3896,"Some people loved ""The Aristocrats"" and others...",7,1


In [None]:
# clean text
# Download the NLTK stop-word corpus before it is read on the next line.
nltk.download('stopwords')
# English stop words held in a set; clean_text_util tests membership against it.
stops = set(stopwords.words('english'))
# Single Porter stemmer instance shared by every clean_text_util call.
ps = PorterStemmer()

def clean_text_util(text):
  """Normalise one review into a space-separated string of stemmed tokens.

  Steps, in order: delete every character that is neither a word character
  nor whitespace, lower-case, split on whitespace, drop tokens present in the
  module-level ``stops`` set, and stem the remaining tokens with the
  module-level ``ps`` stemmer.

  Args:
    text: Raw review text. Non-string values are coerced with ``str()``
      before any other processing.

  Returns:
    The cleaned tokens joined by single spaces; "" when no token survives.
  """
  # Coerce BEFORE the regex: re.sub raises TypeError on non-string input, so
  # a cast placed after it would never protect the call.
  text = re.sub(r'[^\w\s]', '', str(text)).lower()

  # NOTE(review): punctuation is deleted before the stop-word lookup, so a
  # contraction such as "don't" is looked up as "dont" -- confirm this matches
  # how the entries in `stops` are spelled.
  # split() without arguments already ignores leading/trailing whitespace.
  kept = (word for word in text.split() if word not in stops)
  return " ".join(ps.stem(word) for word in kept)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Run the cleaner over every training review and overwrite the text column.
# Because the column is replaced in place, re-running this cell would clean
# the already-cleaned text a second time.
train_df["text"] = train_df["text"].apply(clean_text_util)

train_df.head()

Unnamed: 0,id,text,rating,label
0,0,bromwel high cartoon comedi ran time program s...,9,1
1,10000,homeless houseless georg carlin state issu yea...,8,1
2,10001,brilliant overact lesley ann warren best drama...,10,1
3,10002,easili underr film inn brook cannon sure flaw ...,7,1
4,10003,typic mel brook film much less slapstick movi ...,8,1


In [None]:
# Run the same cleaner over every test review and overwrite the text column.
# Because the column is replaced in place, re-running this cell would clean
# the already-cleaned text a second time.
test_df["text"] = test_df["text"].apply(clean_text_util)

test_df.head()

Unnamed: 0,id,text,rating,label
0,0,went saw movi last night coax friend mine ill ...,10,1
1,10000,actor turn director bill paxton follow promis ...,7,1
2,10001,recreat golfer knowledg sport histori pleas di...,9,1
3,10002,saw film sneak preview delight cinematographi ...,8,1
4,10003,bill paxton taken true stori 1913 us golf open...,8,1


In [None]:
# vectorize using tfidf
# vectorize using tfidf
# max_features caps the vocabulary size at 10,000 terms.
vectorizer = TfidfVectorizer(max_features=10000)

# Fit on the training text only and vectorise it in the same pass;
# fit_transform avoids tokenising the training corpus twice, which a
# separate fit() followed by transform() would do.
X_train = vectorizer.fit_transform(train_df["text"])
# The test text is projected with the already-fitted vocabulary and IDF
# weights; nothing is re-learned from the test split.
X_test = vectorizer.transform(test_df["text"])

In [None]:
# classifier
# classifier
# Support-vector classifier with a linear kernel, trained on the TF-IDF
# matrix against the integer labels (1 / -1).
classifier = svm.SVC(kernel='linear')
classifier.fit(X_train, train_df['label'])
prediction = classifier.predict(X_test)

# Headline F1; in the recorded output it equals the positive-class f1-score.
f1 = f1_score(test_df['label'], prediction)
# output_dict=True returns the report as a dict; the per-class entries are
# read below with the labels as string keys ('1' and '-1').
report = classification_report(test_df['label'], prediction, output_dict=True)
print('F1 score:', f1)
print('positive: ', report['1'])
print('negative: ', report['-1'])

F1 score: 0.8712743676494281
positive:  {'precision': 0.8773523685918235, 'recall': 0.86528, 'f1-score': 0.8712743676494281, 'support': 12500}
negative:  {'precision': 0.8671085858585859, 'recall': 0.87904, 'f1-score': 0.8730335293182901, 'support': 12500}
