In [None]:
#!/usr/bin/env python
# Resources Used (as of 06/25/2020)

# NLP Resources
#  - https://nlp.stanford.edu/IR-book/information-retrieval-book.html
#  - https://nlp.stanford.edu/
#  - https://www.nltk.org/
#  - https://www.nltk.org/book/ch05.html

In [None]:
# Libraries
import nltk, re, pandas as pd, numpy as np, string, time, gc as gc, warnings

# Visualization
import seaborn as sns, matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# Other
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, cross_validate, ShuffleSplit, cross_val_score
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report

# Import all the classifiers
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression, SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, ComplementNB, MultinomialNB, BernoulliNB, CategoricalNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [None]:
# Other settings
# Make sure automatic garbage collection is switched on for this kernel
gc.enable()
# Silence every warning for the rest of the session.
# NOTE(review): no category or module is given, so this also hides warnings
# raised by the libraries used below - consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
# Useful helper functions

def plot_df_distributions(df, name):
    """Draw one histogram (with a KDE overlay) for every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted one at a time.
    name : str
        Label for the dataset; used as the prefix of each figure title.

    Returns
    -------
    None
        Each figure is displayed with ``plt.show()`` before the next column
        is drawn, so the call produces one figure per column.
    """
    for column in df.columns:
        values = df[column].to_numpy()
        title = f"{name} Data | {column!s} Distribution"
        # Three bins with a kernel-density estimate drawn on top
        axes = sns.histplot(data=values, bins=3, kde=True)
        axes.set(title=title)
        # Flush the current figure so every column gets its own plot
        plt.show()
               
def remove_nums(arr):
    """Strip every ASCII digit (0-9) out of each string in ``arr``.

    Parameters
    ----------
    arr : iterable of str
        Strings to clean.

    Returns
    -------
    list of str
        A new list, same length and order as ``arr``; a string made up only
        of digits becomes the empty string.
    """
    digits = re.compile('[0-9]')
    return [digits.sub('', item) for item in arr]

def get_words(para, stem):
    """Turn a paragraph into a list of cleaned (optionally stemmed) tokens.

    Steps, in order: lower-case and split on whitespace, delete ASCII
    punctuation from each token, delete digits, drop NLTK English stop words,
    drop tokens that are blank or a single character, then stem if requested.

    Parameters
    ----------
    para : str
        Raw text to tokenise.
    stem : bool
        When truthy, each surviving token is reduced with the English
        Snowball stemmer.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    stop_words = set(stopwords.words('english'))

    # Build the punctuation-deletion table once instead of once per token
    strip_punctuation = str.maketrans('', '', string.punctuation)
    no_punctuation = [token.translate(strip_punctuation) for token in para.lower().split()]

    # Remove integers
    no_integers = remove_nums(no_punctuation)

    # NOTE(review): punctuation is stripped before the stop-word check, so any
    # stop-list entry that contains an apostrophe can no longer match here -
    # confirm this is intended.
    # Single pass: not a stop word, not blank, and longer than one character
    tokens = [
        token for token in no_integers
        if token not in stop_words and token.strip() and len(token) > 1
    ]

    if not stem:
        return tokens

    stemmer = SnowballStemmer('english')
    return [stemmer.stem(word) for word in tokens]
    
def _feature_names(vectorizer):
    """Return a fitted vectorizer's vocabulary in column order.

    Prefers ``get_feature_names_out`` and falls back to the older
    ``get_feature_names`` when the newer method is not available.
    """
    getter = getattr(vectorizer, 'get_feature_names_out', None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return getter()


def build_dataframes(raw, min_df=None, max_df=None):
    """Build Boolean, TF-IDF and term-frequency representations of the reviews.

    Every review in ``raw['review']`` is cleaned and stemmed with
    ``get_words`` and then vectorised three times. Each resulting frame has
    one column per vocabulary term plus a ``'classification'`` column holding
    the matching entry of ``raw['sentiment']``.

    Parameters
    ----------
    raw : pandas.DataFrame
        Must provide a ``'review'`` (text) and a ``'sentiment'`` (label) column.
    min_df, max_df : int, float or None
        Forwarded to the vectorizers as document-frequency cut-offs. ``None``
        falls back to the vectorizers' own defaults (``1`` and ``1.0``), since
        they do not accept ``None``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(boolean_df, tfidf_df, count_df)``, each with a fresh 0..n-1 index.
    """
    ## TODO: Could add verbose?
    min_df = 1 if min_df is None else min_df
    max_df = 1.0 if max_df is None else max_df

    # Take the labels as a plain array: assigning a Series would align on the
    # index, and ``raw`` may carry a gapped index (e.g. after dropping
    # duplicates) while the frames built below use a fresh 0..n-1 index.
    labels = np.asarray(raw['sentiment'])

    # Clean + stem every review and glue the tokens back into one string
    corpus = pd.Series(raw['review']).apply(lambda text: ' '.join(get_words(text, True)))

    # Same preprocessing and frequency cut-offs for all three representations;
    # order here is the order of the returned tuple.
    vectorizers = (
        CountVectorizer(strip_accents='unicode', min_df=min_df, max_df=max_df, binary=True),
        TfidfVectorizer(strip_accents='unicode', min_df=min_df, max_df=max_df),
        CountVectorizer(strip_accents='unicode', min_df=min_df, max_df=max_df),
    )

    frames = []
    for vectorizer in vectorizers:
        matrix = vectorizer.fit_transform(corpus)
        frame = pd.DataFrame(data=matrix.toarray(), columns=_feature_names(vectorizer))
        # Add the classification column (positional, see note on ``labels``)
        frame['classification'] = labels
        frames.append(frame)

    # Free up memory held by the intermediate matrices
    gc.collect()

    trimmed_boolean_df, trimmed_tfidf_df, trimmed_count_df = frames
    return trimmed_boolean_df, trimmed_tfidf_df, trimmed_count_df

def score_model(actuals, preds):
    """Build a printable report comparing true labels with predictions.

    Parameters
    ----------
    actuals : array-like
        True class labels (a pandas Series, list or numpy array).
    preds : array-like
        Predicted class labels, same length and order as ``actuals``.

    Returns
    -------
    str
        Multi-line text with the sample count, accuracy, balanced accuracy
        and the output of ``classification_report``.
    """
    # np.asarray accepts Series, lists and arrays alike, so no blanket
    # try/except is needed to cope with inputs lacking ``.to_numpy()``.
    actuals = np.asarray(actuals)
    preds = np.asarray(preds)

    # Record balanced accuracy, accuracy, and sklearn's classification report
    bal_acc = balanced_accuracy_score(actuals, preds)
    acc = accuracy_score(actuals, preds)
    rpt = classification_report(actuals, preds)

    # Create a pretty print string
    print_str = """
    
    Samples: {}
    Accuracy: {}
    Balanced Accuracy: {}
    
    Report: {}
    """.format(len(preds), acc, bal_acc, rpt)

    return print_str

def run_classification(data, names, classifiers):
    """Fit every classifier on every dataframe and print a test-set report.

    Parameters
    ----------
    data : list of pandas.DataFrame
        Frames to evaluate; each must contain a ``'classification'`` label
        column, every other column is used as a feature.
    names : list of str
        Display name for each frame in ``data`` (same order).
    classifiers : list
        Estimator objects exposing ``fit`` and ``predict``. Each one is
        re-fitted for every frame.

    Raises
    ------
    ValueError
        If fewer names than dataframes are supplied.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    if len(names) < len(data):
        raise ValueError("Expected one name per dataframe: got {} names for {} dataframes"
                         .format(len(names), len(data)))

    for df, df_name in zip(data, names):
        classes = df['classification']
        # Work on a copy without the label column so the caller's dataframe is
        # left intact and this function can be re-run on the same inputs.
        features = df.drop(columns='classification')

        # Split into training and testing (stratified, fixed seed)
        X_train, X_test, y_train, y_test = train_test_split(
            features, classes, stratify=classes, test_size=0.2, random_state=8)

        for clf in classifiers:
            # Get the classifier's name
            clf_name = clf.__class__.__name__

            # Fit it (turn to numpy for speed) and time the training
            train_start = time.time()
            clf.fit(X_train.to_numpy(), y_train.to_numpy())
            train_stop = time.time()

            # Score the model's predictions on the held-out split
            # (not the raw test features themselves)
            preds = clf.predict(X_test.to_numpy())
            res = score_model(y_test.to_numpy(), preds)

            # Print results
            print("REPORT FOR {}".format(clf_name))
            print("Dataset: {}".format(df_name))
            print("Trained {} in {}".format(clf_name, train_stop-train_start))
            print(res)
            print("-----------------------------------------------------------")

In [None]:
# Load the dataset from a CSV file (path is relative to the working directory).
# Later cells read the 'review' and 'sentiment' columns of this frame.
df = pd.read_csv('IMDB Dataset.csv')

In [None]:
# Display the DataFrame as loaded (a bare last expression is rendered as the cell's output)
df

In [None]:
# Remove any duplicate rows, keeping the first occurrence.
# The index is reset so it stays a gap-free 0..n-1 range: build_dataframes
# creates frames with a fresh index and attaches the 'sentiment' labels to
# them, which only lines up when this frame's index has no holes.
df = df.drop_duplicates(keep='first').reset_index(drop=True)

In [None]:
# Display the dataframe again, now that duplicate rows have been dropped
df

In [None]:
# Count the rows per sentiment label to check class balance
# (the author observed a slight imbalance)
df['sentiment'].value_counts()

In [None]:
# Word cloud over every review in the dataset

# Concatenate all reviews into a single space-separated string
corpus = ' '.join(df['review'])

# Build the cloud: large canvas, fixed seed, single words only (no collocations),
# with the wordcloud package's built-in stop words filtered out
wordcloud = WordCloud(
    width=3000,
    height=2000,
    random_state=1,
    background_color='salmon',
    colormap='Pastel1',
    collocations=False,
    stopwords=STOPWORDS,
).generate(corpus)

# Render the cloud as an image without axes
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Word cloud restricted to the positive reviews

# Keep only the rows labelled 'positive'
pos_df = df.loc[df['sentiment'].eq('positive')]

# Concatenate the positive reviews into a single space-separated string
corpus_pos = ' '.join(pos_df['review'])

# Build the cloud with the same settings as the full-dataset cloud
wordcloud = WordCloud(
    width=3000,
    height=2000,
    random_state=1,
    background_color='salmon',
    colormap='Pastel1',
    collocations=False,
    stopwords=STOPWORDS,
).generate(corpus_pos)

# Render the cloud as an image without axes
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Create a wordcloud for negative sentiments

# Keep only the rows labelled 'negative'
neg_df = df.loc[df['sentiment'] == 'negative']

# Create a total corpus of the negative reviews. Stored under its own name so
# it does not overwrite the positive-review corpus held in `corpus_pos`.
corpus_neg = ' '.join(s for s in neg_df['review'])

# Generate word cloud
wordcloud = WordCloud(width = 3000, height = 2000, random_state=1, background_color='salmon', 
                      colormap='Pastel1', collocations=False, stopwords = STOPWORDS).generate(corpus_neg)

# Plot
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Keep terms that occur in at least 100 reviews and in at most 99% of them.
# A float max_df is a proportion of documents (0.0-1.0), so the 99% cut-off is
# written as 0.99 rather than as a row count expressed as a float.
boolean, tfidf, freq = build_dataframes(df, min_df=100, max_df=0.99)

In [None]:
# Plot distributions of boolean dataframe
# (plot_df_distributions shows one figure per column of the frame)
plot_df_distributions(boolean, "Boolean")

In [None]:
# Plot distributions of tfidf dataframe
# (plot_df_distributions shows one figure per column of the frame)
plot_df_distributions(tfidf, "tfidf")

In [None]:
# Plot distributions of frequency dataframe
# (uses `freq`, the term-count frame, rather than plotting `tfidf` a second time)
plot_df_distributions(freq, "frequency")

In [None]:
# Classifiers to compare, all with their default hyper-parameters.
# Order is preserved from the original list.
classifiers = [
    # Tree ensembles
    RandomForestClassifier(),
    AdaBoostClassifier(),
    ExtraTreesClassifier(),
    # Single trees
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    # Linear models
    RidgeClassifier(),
    LogisticRegression(),
    SGDClassifier(),
    # Neural network, nearest neighbours, support vector machine
    MLPClassifier(),
    KNeighborsClassifier(),
    SVC(),
    # Naive Bayes variants
    GaussianNB(),
    ComplementNB(),
    MultinomialNB(),
    BernoulliNB(),
    CategoricalNB(),
    # Gradient boosting libraries
    XGBClassifier(),
    LGBMClassifier(),
]