In [None]:
#!/usr/bin/env python
# Resources Used (as of 01/18/2021)

# NLP Resources
#  - https://nlp.stanford.edu/IR-book/information-retrieval-book.html
#  - https://nlp.stanford.edu/
#  - https://www.nltk.org/
#  - https://www.nltk.org/book/ch05.html

In [None]:
# Libraries
import nltk, re, pandas as pd, numpy as np, string, time, gc as gc, warnings

# Visualization
import seaborn as sns, matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# Other
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, cross_validate, ShuffleSplit, cross_val_score
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics import balanced_accuracy_score, accuracy_score, classification_report

# Import all the classifiers
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression, SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, ComplementNB, MultinomialNB, BernoulliNB, CategoricalNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [None]:
def get_words(para):
    """Turn one document into a list of cleaned, stemmed word tokens.

    The text is lower-cased and split on whitespace. Every token then has
    ASCII punctuation and digits removed. English stop words, empty strings
    and single-character leftovers are discarded, and the surviving tokens
    are stemmed with the English Snowball stemmer.

    Parameters
    ----------
    para : str
        Raw text of a single document.

    Returns
    -------
    list of str
        Stemmed tokens, in the order they appeared in ``para``.
    """
    # Build the punctuation table and digit pattern once per call instead of
    # once per token.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    digits = re.compile('[0-9]')
    stemmer = SnowballStemmer('english')

    # Tokens lose their punctuation *before* the stop-word check, so the stop
    # list needs the same treatment: NLTK's English list contains contractions
    # such as "don't", which would otherwise never match the token "dont".
    raw_stop_words = stopwords.words('english')
    stop_words = set(raw_stop_words)
    stop_words.update(word.translate(strip_punctuation) for word in raw_stop_words)

    stemmed_tokens = []
    for raw_token in para.lower().split():
        # Remove punctuation, then digits
        token = digits.sub('', raw_token.translate(strip_punctuation))
        # Skip stop words, empty strings and single-character "words"
        if token in stop_words or not token.strip() or len(token) < 2:
            continue
        stemmed_tokens.append(stemmer.stem(token))
    return stemmed_tokens

In [None]:
# Notebook-wide runtime settings
warnings.simplefilter("ignore")  # suppress every warning category
gc.enable()                      # make sure automatic garbage collection is on

In [None]:
# Read in the data (relative path: the CSV is looked up in the working directory)
df = pd.read_csv('IMDB Dataset.csv')

In [None]:
# Remove duplicate rows, keeping the first occurrence of each.
# The result is rebound instead of mutated in place, and the index is reset so
# the row labels are contiguous again (0..n-1) after rows have been dropped --
# the same default index the vectorised feature frames built from this data get.
df = df.drop_duplicates(keep='first').reset_index(drop=True)

In [None]:
# Word cloud built from every review in the dataset

# Concatenate all reviews into one space-separated string
corpus_all = ' '.join(df['review'])

# Rendering options for the cloud
cloud_options = dict(
    width=3000,
    height=2000,
    random_state=1,
    background_color='salmon',
    colormap='Pastel1',
    collocations=False,
    stopwords=STOPWORDS,
)
wordcloud = WordCloud(**cloud_options).generate(corpus_all)

# Show the cloud as an image without axes
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Word cloud restricted to the positive reviews

# Rows labelled as positive
is_positive = df['sentiment'] == 'positive'
pos_df = df.loc[is_positive]

# Join the positive reviews into one space-separated string
corpus_pos = ' '.join(pos_df['review'])

# Configure the cloud first, then feed it the text
wordcloud = WordCloud(
    stopwords=STOPWORDS,
    collocations=False,
    colormap='Pastel1',
    background_color='salmon',
    random_state=1,
    height=2000,
    width=3000,
)
wordcloud = wordcloud.generate(corpus_pos)

# Render as an image with the axes hidden
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Create a wordcloud for negative sentiments

# Keep only the rows labelled as negative
neg_df = df.loc[df['sentiment'] == 'negative']

# Join the negative reviews into one string. This gets its own name so the
# positive corpus (`corpus_pos`) is not silently overwritten by negative text.
corpus_neg = ' '.join(s for s in neg_df['review'])

# Generate word cloud
wordcloud = WordCloud(width = 3000, height = 2000, random_state=1, background_color='salmon', 
                      colormap='Pastel1', collocations=False, stopwords = STOPWORDS).generate(corpus_neg)

# Plot the cloud as an image with the axes hidden
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Take the classes (sentiments): the 'sentiment' column of the review frame.
# Used below as the target and as the stratification key of every train/test split.
classes = df['sentiment']

In [None]:
# Keep the raw review text as its own series
s = pd.Series(df['review'])


def _clean_review(text):
    """Return the review's cleaned tokens re-joined with single spaces."""
    return ' '.join(get_words(text))


# Cleaned corpus: one normalised string per review
corpus = s.apply(_clean_review)

In [None]:
# Create a frequency (term-count) representation of the data.
# min_df=50 keeps terms that occur in at least 50 documents. A float max_df is
# interpreted as a proportion of documents, so it must lie in [0, 1]:
# 0.99 drops terms that appear in more than 99% of the documents.
count_vectorizer = CountVectorizer(strip_accents='unicode', min_df=50, max_df=0.99)
count = count_vectorizer.fit_transform(corpus)

# Newer scikit-learn releases expose get_feature_names_out() in place of
# get_feature_names(); use whichever this installation provides.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    count_features = count_vectorizer.get_feature_names_out()
else:
    count_features = count_vectorizer.get_feature_names()

# toarray() yields a plain ndarray (todense() would give a np.matrix)
count_df = pd.DataFrame(data=count.toarray(), columns=count_features)

In [None]:
# Create a boolean (term present / absent) representation of the data.
# min_df=50 keeps terms that occur in at least 50 documents. A float max_df is
# interpreted as a proportion of documents, so it must lie in [0, 1]:
# 0.99 drops terms that appear in more than 99% of the documents.
boolean_vectorizer = CountVectorizer(strip_accents='unicode',
                                     min_df=50, max_df=0.99,
                                     binary=True)
boolean = boolean_vectorizer.fit_transform(corpus)

# Newer scikit-learn releases expose get_feature_names_out() in place of
# get_feature_names(); use whichever this installation provides.
if hasattr(boolean_vectorizer, 'get_feature_names_out'):
    boolean_features = boolean_vectorizer.get_feature_names_out()
else:
    boolean_features = boolean_vectorizer.get_feature_names()

# toarray() yields a plain ndarray (todense() would give a np.matrix)
boolean_df = pd.DataFrame(data=boolean.toarray(), columns=boolean_features)

In [None]:
# Create a TF-IDF representation of the data.
# min_df=50 keeps terms that occur in at least 50 documents. A float max_df is
# interpreted as a proportion of documents, so it must lie in [0, 1]:
# 0.99 drops terms that appear in more than 99% of the documents.
tfidf_vectorizer = TfidfVectorizer(strip_accents='unicode', min_df=50, max_df=0.99)
tfidf = tfidf_vectorizer.fit_transform(corpus)

# Newer scikit-learn releases expose get_feature_names_out() in place of
# get_feature_names(); use whichever this installation provides.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_features = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_features = tfidf_vectorizer.get_feature_names()

# toarray() yields a plain ndarray (todense() would give a np.matrix)
tfidf_df = pd.DataFrame(data=tfidf.toarray(), columns=tfidf_features)

In [None]:
# Get a list of classifiers to compare.
# Estimators whose fitting is inherently randomised are given a fixed seed
# (the same value the train/test split uses) so that reports are reproducible
# from run to run; the remaining estimators keep their defaults.
RANDOM_STATE = 8

classifiers = [
    # Naive Bayes family
    GaussianNB(), ComplementNB(), MultinomialNB(), BernoulliNB(),
    # Ensembles
    RandomForestClassifier(random_state=RANDOM_STATE),
    AdaBoostClassifier(random_state=RANDOM_STATE),
    ExtraTreesClassifier(random_state=RANDOM_STATE),
    # Single trees
    DecisionTreeClassifier(random_state=RANDOM_STATE),
    ExtraTreeClassifier(random_state=RANDOM_STATE),
    # Linear models
    RidgeClassifier(),
    LogisticRegression(),
    SGDClassifier(random_state=RANDOM_STATE),
    # Neural network, neighbours, kernel SVM
    MLPClassifier(random_state=RANDOM_STATE),
    KNeighborsClassifier(),
    SVC(),
    # Gradient boosting libraries
    XGBClassifier(),
    LGBMClassifier(),
]

In [None]:
# Test Frequency Data
# The feature frame is passed to the splitter directly: rebinding `df` to a
# copy of it would hide the raw review frame from any re-run of earlier cells
# and duplicate a large dense matrix for no benefit.

# Split into training and testing (stratified on the sentiment label)
X_train, X_test, y_train, y_test = train_test_split(count_df, classes, stratify=classes,
                                                    test_size=0.2, random_state=8)

for clf in classifiers:
    # Release memory left over from the previous classifier
    gc.collect()
    # Get the classifier's name
    clf_name = clf.__class__.__name__

    # Fit it, timing the fit with a monotonic high-resolution clock
    train_start = time.perf_counter()
    clf.fit(X_train, y_train)
    train_stop = time.perf_counter()

    # Get predictions
    preds = clf.predict(X_test)

    # Record balanced accuracy, accuracy, and sklearn's classification report
    bal_acc = balanced_accuracy_score(y_test, preds)
    acc = accuracy_score(y_test, preds)
    rpt = classification_report(y_test, preds)

    # Create a pretty print string
    print_str = """
    
    Samples: {}
    Accuracy: {}
    Balanced Accuracy: {}
    
    Report: {}
    """.format(len(preds), acc, bal_acc, rpt)

    # Print results
    print("{} | REPORT FOR {}".format("Frequency", clf_name))
    print("Trained {} in {}".format(clf_name, train_stop-train_start))
    print(print_str)
    print("-----------------------------------------------------------")

In [None]:
# Test TFIDF Data
# The feature frame is passed to the splitter directly: rebinding `df` to a
# copy of it would hide the raw review frame from any re-run of earlier cells
# and duplicate a large dense matrix for no benefit.

# Split into training and testing (stratified on the sentiment label)
X_train, X_test, y_train, y_test = train_test_split(tfidf_df, classes, stratify=classes,
                                                    test_size=0.2, random_state=8)

for clf in classifiers:
    # Release memory left over from the previous classifier
    gc.collect()
    # Get the classifier's name
    clf_name = clf.__class__.__name__

    # Fit it, timing the fit with a monotonic high-resolution clock
    train_start = time.perf_counter()
    clf.fit(X_train, y_train)
    train_stop = time.perf_counter()

    # Get predictions
    preds = clf.predict(X_test)

    # Record balanced accuracy, accuracy, and sklearn's classification report
    bal_acc = balanced_accuracy_score(y_test, preds)
    acc = accuracy_score(y_test, preds)
    rpt = classification_report(y_test, preds)

    # Create a pretty print string
    print_str = """
    
    Samples: {}
    Accuracy: {}
    Balanced Accuracy: {}
    
    Report: {}
    """.format(len(preds), acc, bal_acc, rpt)

    # Print results
    print("{} | REPORT FOR {}".format("TFIDF", clf_name))
    print("Trained {} in {}".format(clf_name, train_stop-train_start))
    print(print_str)
    print("-----------------------------------------------------------")

In [None]:
# Test boolean Data
# The feature frame is passed to the splitter directly: rebinding `df` to a
# copy of it would hide the raw review frame from any re-run of earlier cells
# and duplicate a large dense matrix for no benefit.

# Split into training and testing (stratified on the sentiment label)
X_train, X_test, y_train, y_test = train_test_split(boolean_df, classes, stratify=classes,
                                                    test_size=0.2, random_state=8)

for clf in classifiers:
    # Release memory left over from the previous classifier
    gc.collect()
    # Get the classifier's name
    clf_name = clf.__class__.__name__

    # Fit it, timing the fit with a monotonic high-resolution clock
    train_start = time.perf_counter()
    clf.fit(X_train, y_train)
    train_stop = time.perf_counter()

    # Get predictions
    preds = clf.predict(X_test)

    # Record balanced accuracy, accuracy, and sklearn's classification report
    bal_acc = balanced_accuracy_score(y_test, preds)
    acc = accuracy_score(y_test, preds)
    rpt = classification_report(y_test, preds)

    # Create a pretty print string
    print_str = """
    
    Samples: {}
    Accuracy: {}
    Balanced Accuracy: {}
    
    Report: {}
    """.format(len(preds), acc, bal_acc, rpt)

    # Print results
    print("{} | REPORT FOR {}".format("Boolean", clf_name))
    print("Trained {} in {}".format(clf_name, train_stop-train_start))
    print(print_str)
    print("-----------------------------------------------------------")

In [None]:
# Histogram (with a KDE overlay) for every column of the boolean features
for col in boolean_df.columns:
    # Values of the current feature as a plain array
    column_values = boolean_df[col].to_numpy()
    plot_title = "{} Data | {} Distribution".format("Boolean", str(col))

    # Draw the distribution and label it
    ax = sns.histplot(data=column_values, bins=3, kde=True)
    ax.set(title=plot_title)

    # Render this feature's figure before moving on to the next one
    plt.show()

In [None]:
# One histogram (plus KDE curve) per term-count feature
for col, term_counts in count_df.items():
    # Title names the representation and the feature being shown
    heading = "{} Data | {} Distribution".format("Frequency", str(col))
    axes = sns.histplot(data=term_counts.to_numpy(), bins=3, kde=True)
    axes.set(title=heading)
    # Display the finished figure
    plt.show()

In [None]:
# Plot distributions of tfidf data
# Loop through the df, drawing one histogram (with KDE overlay) per feature
for col in tfidf_df.columns:
    # Plot the distribution; the title names the TFIDF representation so these
    # figures cannot be mistaken for the frequency (term-count) plots.
    sns.histplot(data=tfidf_df[col].to_numpy(), bins=3,
                 kde=True).set(title="{} Data | {} Distribution".format("TFIDF", str(col)))
    # Show the figure
    plt.show()