**Use a Random Forest Classifier to Obtain Review Sentiment**

In [148]:
import numpy as np
import pandas as pd
import gensim
from gensim.models import Doc2Vec
import nltk
import nltk.data
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup 
import re 
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
from gensim.summarization import keywords
from gensim.summarization.summarizer import summarize

In [40]:
df = pd.read_csv('data/Final Dataframe/dataframe_final.csv')[23391:]

In [41]:
df.head()

Unnamed: 0.1,Unnamed: 0,review_id,review_date,review_text,agree_score,disagree_score,funny_score,professor_id,professor_name,review_id.1,course_id,course_name,workload,workload_label,review_label
23391,23391,78219,1/1/2015 0:00,\r\r\r\nAli altug is a great teacher! Takes ti...,1,1,0,11929.0,"Altug, Ali",78219,1507.0,[MATH V1106] Calculus IIS,Weekly homework sets \r\r\r\n2 midterms and a ...,,2.0
23392,23392,78220,1/1/2015 0:00,"\r\r\r\nOkay, at the risk of being anonymously...",1,1,1,3980.0,"Baswell, Christopher",78220,5314.0,[ENGL W4917] Writing on Disability,Very reasonable. He uses the midterms to make ...,,2.0
23393,23393,78221,1/1/2015 0:00,\r\r\r\nProf Gray is the most overrated teache...,2,9,1,2468.0,"Gray, Erik",78221,3390.0,Romantic Poetry,readings for class. 2 papers. a final exam. Aw...,,1.0
23394,23394,78223,1/2/2015 0:00,\r\r\r\nSid’s a pretty chill guy. He works at ...,6,0,0,7069.0,"Dastidar, Siddhartha",78223,6828.0,[IEOR E4711] Global Capital Markets,\r\r\r\n4 HWs\r\r\r\n1 midterm\r\r\r\n1 final\...,,2.0
23395,23395,78224,1/2/2015 0:00,"\r\r\r\nUnfortunately, I'm 1.5 years too late ...",1,0,0,13059.0,"Kun, Ilya",78224,,,"Very reasonable. Just about every class, write...",,2.0


In [42]:
df.review_label = np.where(np.isnan(df.review_label) == True, 1, df.review_label)
df.review_label = np.where(df.review_label == 3, 2, df.review_label)
df.review_label = np.where(df.review_label == 0, 1, df.review_label)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df.review_text, df.review_label, test_size = 0.15, random_state = 42)

**Preprocessing Text Data**

In [None]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [71]:
def get_tokens(review, tokenizer = tokenizer, remove_stopwords=False):
    
    tokens = []
    
    # Remove html tags
    review_text = BeautifulSoup(review, 'lxml').get_text()
    
    # 2. Remove non-letter.
    review_text = re.sub("[^a-zA-Z]"," ",review_text)
    
    # Converting to lower case
    review_text = review_text.lower().strip()
    
    # Tokenize
    
    tokens = nltk.tokenize.word_tokenize(review_text)
    
    # Remove short words
    tokens = [t for t in tokens if len(t) > 2]
    
    # Lemmatize
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]
    
    # Optionally remove stopwords
    if remove_stopwords:
        stops = set(stopwords.words("english"))     
        tokens = [t for t in tokens if t not in stopwords]
        
    return tokens

In [115]:
train_words = []

for x in tqdm(X_train):
    train_words.append(get_tokens(x))

100%|█████████████████████████████████████████████████████████████████████████████| 1851/1851 [00:05<00:00, 338.03it/s]


In [116]:
test_words = []

for x in tqdm(X_test):
    test_words.append(get_tokens(x))

100%|███████████████████████████████████████████████████████████████████████████████| 327/327 [00:00<00:00, 349.12it/s]


In [114]:
test_words

[]

In [84]:
model = word2vec.Word2Vec(tokens, min_count=1, window=10, size = 100, sample = 1e-4, negative = 5, workers = 4)

In [108]:
def get_vector(tokens, model, num_features = 100):
    # Pre-initialising empty numpy array for speed
    vec = np.zeros(num_features, dtype="float32")
    nwords = 0
    
    index2word_set = set(model.wv.index2word)
    
    for t in  tokens:
        if t in index2word_set:
            nwords += 1
            vec = np.add(vec, model[t])

    return vec/nwords

In [140]:
get_vector(train_words[0], model)

  # This is added back by InteractiveShellApp.init_path()


array([-0.5201261 , -0.01457011,  0.01409439, -0.15438278,  0.17632215,
        0.1951441 , -0.21365334, -0.51284015,  0.11547894,  0.2985927 ,
        0.14824753,  0.5504369 , -0.21594507,  0.02095696, -0.43537873,
        0.09777957, -0.25318637, -0.9861365 , -0.313873  ,  0.516276  ,
       -0.3066933 , -0.16042075,  0.08250234,  0.42578763,  0.025206  ,
       -0.19377595,  1.0170013 ,  1.0052099 ,  0.15070602, -0.00375615,
        0.6878507 , -0.39442798,  0.44364023, -0.26630798, -0.3418949 ,
        0.65762407,  0.31151134, -0.53137   ,  0.18513793,  0.02172099,
       -0.46597007,  0.11907882, -0.01957696, -0.4639639 ,  0.38795373,
       -0.17394426,  0.08316743, -0.61706066,  0.1035533 ,  0.6003404 ,
       -0.86080825,  0.5680906 ,  0.02415073,  0.6480171 , -0.6243746 ,
       -0.04765426, -0.51644903,  0.42157203, -0.04098596,  0.07487419,
       -0.07984081, -0.26711944, -0.49153793, -0.2357799 ,  0.6053948 ,
        0.07163803,  0.09339514, -0.42713803,  0.6514242 ,  0.19

In [146]:
train_vecs = [get_vector(w, model) for w in train_words]

  # This is added back by InteractiveShellApp.init_path()


In [143]:
test_vecs = [get_vector(w, model) for w in test_words]

  # This is added back by InteractiveShellApp.init_path()


In [151]:
def binarize_score(score):
    """
    set scores of 1-3 to 0 and 4-5 as 1
    """
    
    if score == 1:
        return 0
    else:
        return 1

In [155]:
y_train = y_train.map(binarize_score)
y_test = y_test.map(binarize_score)

In [156]:
clf = SGDClassifier(loss='log')
clf.fit(train_vecs, y_train)
p=clf.predict_proba(test_vecs)
roc_auc_score(y_test, p[:,1])



0.5632272139625081