In [4]:
!pip install bs4
!pip install nltk

Collecting nltk
  Using cached nltk-3.7-py3-none-any.whl (1.5 MB)
Collecting regex>=2021.8.3
  Using cached regex-2022.1.18-cp39-cp39-win_amd64.whl (273 kB)
Installing collected packages: regex, nltk
Successfully installed nltk-3.7 regex-2022.1.18


In [24]:
import re
import bs4
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
import pandas as pd
from tqdm.notebook import tqdm_notebook, tqdm

tqdm.pandas()

In [25]:
# processes a review and returns a list of words
def review_to_words(review, string = True, remove_stopwords=True):
    # Remove HTML
    review_text = BeautifulSoup(review).get_text()
    # Remove non-letters
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    # Convert words to lower case and split them
    words = review_text.lower().split()
    # Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    if string:
        return " ".join(words)
    else:
        return words

In [41]:
train_df = pd.read_json('../processed_data/train.json')
test_df = pd.read_json('../processed_data/test.json')
train_df = train_df.sample(frac=0.5)
test_df = test_df.sample(frac=0.5)

In [42]:
train_df['words'] = train_df['text'].progress_apply(review_to_words)
test_df['words'] = test_df['text'].progress_apply(review_to_words)

  0%|          | 0/12500 [00:00<?, ?it/s]

  0%|          | 0/12500 [00:00<?, ?it/s]

In [43]:
# clean_train_reviews = train_df['words'].tolist()
# clean_test_reviews = test_df['words'].tolist()
clean_train_reviews = train_df['words']
clean_test_reviews = test_df['words']

In [44]:
# import statements
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [45]:
# Initialize a bag of words  
vectorizer = CountVectorizer(analyzer = "word",   
                             tokenizer = None,    
                             preprocessor = None, 
                             stop_words = None,   
                             max_features = 5000) 

# Fit transform the data
train_feat = vectorizer.fit_transform(clean_train_reviews).toarray()
test_feat = vectorizer.transform(clean_test_reviews).toarray()

In [46]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

In [47]:
# Get predicitons
def get_preds(test_feat, train_feat, y_test, y_train, model, title='Random Forest'):
    model.fit(train_feat, y_train)
    preds = model.predict(test_feat)
    fpr, tpr, _ = roc_curve(y_test, preds)
    roc_auc = auc(fpr, tpr)
    print('AUC:', roc_auc)
    plt.plot(fpr, tpr)
    plt.title(title)
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()
    return preds

In [48]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB

In [49]:
mapping = {'+': 1, '-': -1}
train_df.replace({'label': mapping})
test_df.replace({'label': mapping})

Unnamed: 0,id,text,rating,label,words
4649,2935,Having dabbled in the modeling industry (as a ...,7,1,dabbled modeling industry model watch show sli...
19324,4893,This movie was ridiculous from the start. Let ...,1,-1,movie ridiculous start let save time watching ...
9728,7506,PLOT SPOILERS!!!! Dr. Boch (George C. Scott) i...,7,1,plot spoilers dr boch george c scott chief med...
23013,8212,"The basic idea behind ""Dungeon of Harrow"" isn'...",2,-1,basic idea behind dungeon harrow bad acting ho...
20179,5662,iCarly is about a teenage girl named Carly Sha...,1,-1,icarly teenage girl named carly shay miranda c...
...,...,...,...,...,...
17836,3553,I had high hopes for this one after reading ea...,2,-1,high hopes one reading earlier reviews slow pl...
15727,1655,This is one of the worst pieces of cinema I ha...,1,-1,one worst pieces cinema seen time also first r...
24184,9267,"With a cheap pound shop having just opened,i t...",4,-1,cheap pound shop opened thought worth looking ...
18803,4423,This movie is a waste of time. Though it has a...,3,-1,movie waste time though actors potential somet...


In [50]:
train_sentiment = train_df['text']
test_sentiment = test_df['text']

In [51]:
preds_rf = get_preds(test_feat, train_feat, 
                  test_sentiment, train_sentiment, 
                     RandomForestClassifier())
#                   RandomForestClassifier(n_estimators = 100))

# preds_nb = get_preds(test_feat, train_feat, 
#                   test_sentiment, train_sentiment, 
#                   MultinomialNB(), title='Naive Bayes')

MemoryError: could not allocate 1635516416 bytes