# EN.605.801 
# Independent Study in Computer Science

## Bayesian Approach to Sentiment Analysis Classification

This notebook intends to provide project status through 06/18/2020. 

&nbsp;

### The accomplished action items thus far:

 - Scale all positive sentiment values x 3 while keeping negative sentiment values at x1
 - Implement K-CFV scheme into testing
 - Create framework for Monte Carlo simulations of model performance
 - Create visuals for model performance
 - POS Tagging
 - Curve fitting documentation complete
 - Git repository updated
 - Extended to multiclass Amazon review dataset

&nbsp;

### The following action items require additional work:
 - Determine feature importances
 - Code clean up

# Resources Used (as of 06/25/2020)
&nbsp;

## Text Parsing
 - https://github.com/wundermahn/Yelp-Classification-ML
 
 &nbsp;
 
## NLP Resources
 - https://nlp.stanford.edu/IR-book/information-retrieval-book.html
 - https://nlp.stanford.edu/
 - https://www.nltk.org/
 - https://www.nltk.org/book/ch05.html
 
 &nbsp;
 
## ML / Classification Resources
 - https://scikit-learn.org/stable/modules/naive_bayes.html
 - https://scikit-learn.org/stable/modules/cross_validation.html
 - https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html#sklearn.model_selection.cross_validate
 - https://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
 
 &nbsp;
 
## Academic Whitepapers
 - https://www.researchgate.net/publication/221650814_Spam_Filtering_with_Naive_Bayes_-_Which_Naive_Bayes
 - https://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf (**)
 - https://www.researchgate.net/publication/221439320_The_Optimality_of_Naive_Bayes
 - https://www.cs.cmu.edu/~knigam/papers/multinomial-aaaiws98.pdf
 - https://www.sciencedirect.com/science/article/pii/S2090447914000550
 - https://www.sciencedirect.com/science/article/pii/S0888613X08001400
 
 &nbsp;
 
## Data
 - https://ai.stanford.edu/~amaas/data/sentiment/
     - https://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf
 - https://analyticsindiamag.com/10-popular-datasets-for-sentiment-analysis/
     - http://www.cs.jhu.edu/~mdredze/datasets/sentiment/

In [1]:
# Perform all necessary imports

import pandas as pd, os, gc as gc, nltk, re, string, numpy as np, time, pickle, warnings
import matplotlib.pyplot as plt, plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, cross_validate, ShuffleSplit, cross_val_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB
from sklearn.svm import SVC # Per industry "best practice"
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

gc.enable()

In [2]:
# Function to plot the distributions of each word in the df
def plot_df_distributions(df, name):
    """Show one 10-bin histogram for every column of ``df``.

    Args:
        df: DataFrame whose columns are plotted one after another.
        name: Label for the dataset; it is inserted into each figure title.
    """
    for column in df.columns:
        # Histogram of the raw column values
        values = df[column].to_numpy()
        plt.hist(values, bins=10)
        # The title identifies both the dataset and the column being shown
        plt.suptitle("Data Type: {} | {} Distribution".format(str(name), str(column)))
        plt.show()
        
# Function to plot the distribution of each requested POS count column in the df
def plot_pos_distributions(df, cols):
    """Show one 10-bin histogram for each column of ``df`` listed in ``cols``.

    Args:
        df: DataFrame holding the columns to plot.
        cols: Iterable of column names; each name is also used in the title.
    """
    for column in cols:
        # Histogram of the raw column values
        values = df[column].to_numpy()
        plt.hist(values, bins=10)
        plt.suptitle("POS Tag: {} | Distribution".format(str(column)))
        plt.show()

# Function to plot the results of the classifiers
def plot_results(df, name, metric):
    """Draw a plotly line chart of one results column, one line per classifier.

    Args:
        df: Results frame; it must contain 'Simulation' and 'Classifier'
            columns plus the column named by ``metric``.
        name: Dataset label used in the chart title.
        metric: Name of the column plotted on the y axis.
    """
    chart_title = '{} Data Classifier Performance'.format(name)
    # Simulation number on x, the chosen metric on y, coloured by classifier
    fig = px.line(df, x='Simulation', y=metric, color='Classifier')
    # The y axis is labelled 'Accuracy' whichever metric column is plotted
    fig.update_layout(title=chart_title, xaxis_title='Simulation', yaxis_title='Accuracy')
    fig.show()
    
# This function removes numbers from an array
def remove_nums(arr):
    """Return a list with every ASCII digit stripped from each string in ``arr``.

    Args:
        arr: Iterable of strings (a list or a generator both work).

    Returns:
        A new list, in the same order as ``arr``. Strings made up only of
        digits become empty strings; they are not removed from the list.
    """
    digit_re = re.compile('[0-9]')
    return [digit_re.sub('', item) for item in arr]

# This function cleans the passed in paragraph and parses it
def get_words(para, stem):
    """Tokenise a paragraph into cleaned, optionally stemmed, words.

    The text is lower-cased and split on whitespace. Punctuation and digits
    are then removed from each word, and English stop words, empty strings and
    single-character words are discarded.

    Args:
        para: The raw text to tokenise.
        stem: Stemming is applied only when this compares equal to ``True``.

    Returns:
        A list of tokens, Snowball-stemmed when stemming was requested.
    """
    stop_words = set(stopwords.words('english'))
    # One translation table that deletes every punctuation character
    strip_punct = str.maketrans('', '', string.punctuation)

    # Punctuation is stripped first, then digits
    cleaned = remove_nums(word.translate(strip_punct) for word in para.lower().split())

    # Stop words are matched after punctuation removal; blanks and
    # one-character leftovers are dropped as well
    tokens = [
        word for word in cleaned
        if word not in stop_words and word.strip() and len(word) > 1
    ]

    if stem == True:
        stemmer = SnowballStemmer('english')
        return list(map(stemmer.stem, tokens))

    return tokens

# This function parses NLTK returned POS tuples
def parse_tuples(list_of_tuples, verbose):
    """Count nouns, adjectives, verbs and other words in POS-tagged tokens.

    Args:
        list_of_tuples: Iterable of ``(word, tag)`` pairs; only the tag at
            index 1 is inspected.
        verbose: When truthy, print each pair together with its category.

    Returns:
        A 4-tuple ``(nouns, adjectives, verbs, other)``. Note the order:
        adjectives come before verbs.
    """
    # Checked in order; the first entry whose marker occurs in the tag wins
    categories = (
        (('NN',), "Noun: {}"),
        (('JJ',), "Adjective: {}"),
        (('VB', 'VP'), "Verb: {}"),
    )
    # Slots: 0 = noun, 1 = adjective, 2 = verb, 3 = other
    tallies = [0, 0, 0, 0]

    for pair in list_of_tuples:
        tag = pair[1]
        # Anything matching none of the markers falls through to "other"
        slot, message = 3, "Other: {}"
        for position, (markers, template) in enumerate(categories):
            if any(marker in tag for marker in markers):
                slot, message = position, template
                break

        tallies[slot] += 1
        if verbose:
            print(message.format(pair))

    return tuple(tallies)

## Section 1: Using K-CFV and Simulated Results
This section moves to 5-fold CFV and simulates results over multiple trials instead of a single static run. Both the fold count and the trial count are variables set below.

In [3]:
# Number of splits per cross-validation run (passed to ShuffleSplit as n_splits)
folds = 5
# Number of times each experiment is repeated by the outer simulation loops
sims = 10

In [4]:
# Import the data built by Iteration 1
boolean = pd.read_csv('Trimmed_Boolean.csv')
tfidf = pd.read_csv('Trimmed_TFIDF.csv')
freq = pd.read_csv('Trimmed_Count.csv')

# Create lists of the dataframes, and their names (kept in the same order)
dfs = [boolean, tfidf, freq]
names = ['Boolean', 'TFIDF', 'Frequency']

# Drop the unnamed index column pandas adds when a frame is written to csv
# together with its index. errors='ignore' keeps this from raising KeyError
# when a file was written without that column.
for df in dfs:
    df.drop(columns='Unnamed: 0', errors='ignore', inplace=True)

In [None]:
# Start calculating their correlation

## TODO: Turn this into a function

# seaborn draws the heatmaps below; it is imported here because the
# notebook's import cell does not bring it in
import seaborn as sns

for i, frame in enumerate(dfs):
    # Set figure and axis size
    fig, ax = plt.subplots(figsize=(15, 12))
    # Work on a copy so the columns dropped below stay in the original frame
    temp = frame.copy()
    # https://stackoverflow.com/questions/29294983/how-to-calculate-correlation-between-all-columns-and-remove-highly-correlated-on
    # Create a matrix of absolute Spearman correlations
    corr_matrix = temp.corr(method='spearman').abs()

    # Keep only the strict upper triangle so each pair of columns is examined
    # once. The builtin bool is used as the mask dtype because the np.bool
    # alias was removed in NumPy 1.24.
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    # Remove features that have more than a 95% correlation, or less than a
    # 1% correlation, with some other feature
    to_drop = [column for column in upper.columns if any(upper[column] > 0.95) or any(upper[column] < 0.01)]
    temp.drop(to_drop, axis=1, inplace=True)

    # Now, re-calculate the (signed) correlation with the remaining columns
    corr = temp.corr(method='spearman')

    # Create a heatmap on the axes created above
    _ = sns.heatmap(corr, fmt="f", linewidths=0.25, center=0, cmap='coolwarm', linecolor='black', ax=ax)
    # Set the title
    ax.set_title('{} Correlation Heatmap'.format(names[i]))

    # Show the plot, then close it
    plt.show()
    plt.close()

In [None]:
# Histogram every column of each dataset, titled with the dataset's name
for i, frame in enumerate(dfs):
    plot_df_distributions(frame, names[i])

In [None]:
# Test some basic classifiers

# Declare a list of classifiers to try
# classifiers = [GaussianNB(), MultinomialNB(), BernoulliNB(),
#                SVC(C=1.0, kernel='linear', degree=3, gamma='auto')]

classifiers = [GaussianNB(), MultinomialNB(), BernoulliNB()]

# Create blank dataframes to hold the results of the classification attempts
result_columns = ['Simulation', 'Classifier', 'Best', 'Worst', 'Average']
boolean_results = pd.DataFrame(columns=result_columns)
tfidf_results = pd.DataFrame(columns=result_columns)
freq_results = pd.DataFrame(columns=result_columns)

# Result frames listed in the same order as dfs / names, so index i selects
# the frame that belongs to dataset i
result_frames = [boolean_results, tfidf_results, freq_results]

# Run a given number of simulations
for n in range(sims):
    # Loop through each of the datasets
    for i, dataset in enumerate(dfs):
        print("*--------------------------------------------*")
        # Randomly shuffle the rows; sample() returns a new frame, so the
        # source dataset is left untouched
        temp = dataset.sample(frac=1).reset_index(drop=True)
        # Get your classes, maintaining the same index as the data
        classes = temp['classification']
        temp.drop('classification', inplace=True, axis=1)

        # Try each classifier
        for clf in classifiers:
            # Take the classifiers name
            name = str(clf.__class__.__name__)
            try:
                # Utilize a shuffle split for the cross validation
                cv = ShuffleSplit(n_splits=folds, test_size=0.33, random_state=np.random.randint(1, 100))
                # Score once and derive every statistic from that single set
                # of scores, so best/worst/average all describe the same run
                # and the models are fit once instead of four times
                scores = cross_val_score(clf, temp, classes, cv=cv)
            except Exception as err:
                # Report the failure and skip this classifier, so that no row
                # is recorded from undefined or previously computed metrics
                print("Error Calculating: {} ({})".format(name, err))
                continue

            # Capture the best, worst, and average across the splits
            best = round(scores.max(), 5)
            worst = round(scores.min(), 5)
            avg = round(scores.mean(), 5)
            # Print out results
            print("Simulation: {} | Data: {} | Classifier: {} | Best: {} | Worst: {} | Average: {}".format(n+1, names[i], name, best, worst, avg))

            # Append a row with the results to the frame for this dataset
            temp_row = [n+1, name, best, worst, avg]
            result_frames[i].loc[len(result_frames[i]), :] = temp_row

        # Delete temp df
        del temp

        # Free up memory
        gc.collect()

In [None]:
# Gather the per-dataset result frames, in the same order as names
results = [boolean_results, tfidf_results, freq_results]

# One chart per dataset, plotting the 'Average' column for each classifier
for i, result_frame in enumerate(results):
    plot_results(result_frame, names[i], 'Average')

## Section 2: Rerunning above with scaled positive sentiment data
This portion of the notebook analyzes the effects of multiplying all positive sentiments by 3, while leaving negative sentiments at face value

In [None]:
# Perform the 3x scaling suggested by Professor Johnson

# Work on independent copies so the unscaled frames stay untouched
scaled_boolean, scaled_tfidf, scaled_freq = (original.copy() for original in (boolean, tfidf, freq))

# Scaled frames, in the same order as the unscaled dfs list
scaled_dfs = [scaled_boolean, scaled_tfidf, scaled_freq]

# Free memory
gc.collect()

In [None]:
# Per Professor Johnson's instructions: relabel class 1 as 3 and class -1 as 1.
# A single dict-based replace applies both mappings at once, and assigning the
# result back to the column avoids calling an inplace method on the
# df['classification'] selection, which pandas does not guarantee will modify
# df itself (chained assignment).
for df in scaled_dfs:
    df['classification'] = df['classification'].replace({1: 3, -1: 1})

In [None]:
# Snapshot the counts of a particular word before scaling, to ensure it changes.
# The .copy() matters: a bare scaled_freq['written'] can share memory with the
# frame, so the in-place update below could alter the "old" values as well and
# the comparison printed at the end would prove nothing.
old_written = scaled_freq['written'].copy()

# Loop through the scaled dfs
for df in scaled_dfs:
    # Rows whose class label is 3
    is_positive = df['classification'].eq(3)
    # Multiply every feature column by the row's class label ...
    scaled_features = df.drop('classification', axis=1).mul(df['classification'], axis=0)
    # ... and write back only the class-3 rows, so exactly those rows end up x3
    df.update(scaled_features[is_positive])
    # Free up some memory
    gc.collect()

# Collect new written values
new_written = scaled_freq['written']

# Print out proof the word counts change
print("Old Written Values: {} | New Written Values: {}".format(sum(old_written), sum(new_written)))

In [None]:
# Histogram every column of each scaled dataset, titled with the dataset's name
for i, scaled_frame in enumerate(scaled_dfs):
    plot_df_distributions(scaled_frame, names[i])

In [None]:
# Test some basic classifiers

# Declare a list of classifiers to try
# classifiers = [GaussianNB(), MultinomialNB(), BernoulliNB(),
#                SVC(C=1.0, kernel='linear', degree=3, gamma='auto')]

classifiers = [GaussianNB(), MultinomialNB(), BernoulliNB()]

# Create blank dataframes to hold the results of the classification attempts
scaled_result_columns = ['Simulation', 'Classifier', 'Best', 'Worst', 'Average']
scaled_boolean_results = pd.DataFrame(columns=scaled_result_columns)
scaled_tfidf_results = pd.DataFrame(columns=scaled_result_columns)
scaled_freq_results = pd.DataFrame(columns=scaled_result_columns)

# Result frames listed in the same order as scaled_dfs / names, so index i
# selects the frame that belongs to dataset i
scaled_result_frames = [scaled_boolean_results, scaled_tfidf_results, scaled_freq_results]

# Run a given number of simulations
for n in range(sims):
    # Loop through each of the datasets
    for i, dataset in enumerate(scaled_dfs):
        print("*--------------------------------------------*")
        # Randomly shuffle the rows; sample() returns a new frame, so the
        # source dataset is left untouched
        temp = dataset.sample(frac=1).reset_index(drop=True)
        # Get your classes, maintaining the same index as the data
        classes = temp['classification']
        temp.drop('classification', inplace=True, axis=1)

        # Try each classifier
        for clf in classifiers:
            # Take the classifiers name
            name = str(clf.__class__.__name__)
            try:
                # Utilize a shuffle split for the cross validation
                cv = ShuffleSplit(n_splits=folds, test_size=0.33, random_state=np.random.randint(1, 100))
                # Score once and derive every statistic from that single set
                # of scores, so best/worst/average all describe the same run
                # and the models are fit once instead of four times
                scores = cross_val_score(clf, temp, classes, cv=cv)
            except Exception as err:
                # Report the failure and skip this classifier, so that no row
                # is recorded from undefined or previously computed metrics
                print("Error Calculating: {} ({})".format(name, err))
                continue

            # Capture the best, worst, and average across the splits
            best = round(scores.max(), 5)
            worst = round(scores.min(), 5)
            avg = round(scores.mean(), 5)
            # Print out results
            print("Simulation: {} | Data: {} | Classifier: {} | Best: {} | Worst: {} | Average: {}".format(n+1, names[i], name, best, worst, avg))

            # Append a row with the results to the frame for this dataset
            temp_row = [n+1, name, best, worst, avg]
            scaled_result_frames[i].loc[len(scaled_result_frames[i]), :] = temp_row

        # Delete temp df
        del temp

        # Free up memory
        gc.collect()

In [None]:
# Gather the scaled result frames, in the same order as names
scaled_results = [scaled_boolean_results, scaled_tfidf_results, scaled_freq_results]

# One chart per scaled dataset, plotting the 'Average' column for each classifier
for i, scaled_result_frame in enumerate(scaled_results):
    plot_results(scaled_result_frame, names[i], 'Average')

## Section 3: POS Tagging
This section will attempt to build a dataframe that, for each review, counts the number of nouns, verbs, adjectives, and other words.

In [None]:
# In order to properly POS Tag, the raw text, and not vectorized text is needed
raw_text = pd.read_csv('complete_movie_data.csv')
# Drop the unnamed index column if the csv has one. errors='ignore' tolerates
# only a missing column; any other failure still surfaces.
raw_text.drop(columns='Unnamed: 0', errors='ignore', inplace=True)
# Print out the first 10 rows
print(raw_text.head(n=10))

# Free up some memory
gc.collect()

In [None]:
# Collect one row of POS counts per review and build the dataframe in a single
# step afterwards; growing a dataframe one row at a time re-allocates it on
# every insert and leaves the columns with object dtype
pos_df_columns = ['ReviewID', 'Nouns', 'Verbs', 'Adjectives', 'Other', 'Class']
pos_rows = []

# Loop through the input data
for review_id, text, label in zip(raw_text['ID'], raw_text['Text'], raw_text['Class']):
    # Get the (stemmed) tokens according to discussed rules
    tokens = get_words(text, True)
    # Create the POS for each word
    res = nltk.pos_tag(tokens)
    # parse_tuples returns (nouns, adjectives, verbs, other) - adjectives
    # before verbs - while the dataframe columns list verbs first
    nouns, adjectives, verbs, other = parse_tuples(res, False)
    pos_rows.append([review_id, nouns, verbs, adjectives, other, label])

new_df = pd.DataFrame(pos_rows, columns=pos_df_columns)

In [None]:
# Histogram each POS count column across all reviews
pos_count_cols = ['Nouns', 'Verbs', 'Adjectives', 'Other']
plot_pos_distributions(new_df, pos_count_cols)

In [None]:
# Test some basic classifiers

# Declare a list of classifiers to try
# classifiers = [GaussianNB(), MultinomialNB(), BernoulliNB(),
#                SVC(C=1.0, kernel='linear', degree=3, gamma='auto')]

classifiers = [GaussianNB(), MultinomialNB(), BernoulliNB()]

# Create blank dataframes to hold the results of the classification attempts
pos_results = pd.DataFrame(columns=['Simulation', 'Classifier', 'Best', 'Worst', 'Average'])

# Run a given number of simulations
for n in range(sims):
    print("*--------------------------------------------*")
    # Randomly shuffle the rows; sample() returns a new frame, so new_df is
    # left untouched
    temp = new_df.sample(frac=1).reset_index(drop=True)
    # Get your classes, maintaining the same index as the data
    classes = temp['Class'].astype('int')
    # Drop irrelevant columns
    temp.drop(['Class', 'ReviewID'], inplace=True, axis=1)

    # Try each classifier
    for clf in classifiers:
        # Take the classifiers name
        name = str(clf.__class__.__name__)
        try:
            # Utilize a shuffle split for the cross validation
            cv = ShuffleSplit(n_splits=folds, test_size=0.33, random_state=np.random.randint(1, 100))
            # Score once and derive every statistic from that single set of
            # scores, so best/worst/average all describe the same run and the
            # models are fit once instead of four times
            scores = cross_val_score(clf, temp, classes, cv=cv)
        except Exception as err:
            # Report the failure and skip this classifier, so that no row is
            # recorded from undefined or previously computed metrics
            print("Error Calculating: {} ({})".format(name, err))
            continue

        # Capture the best, worst, and average across the splits
        best = round(scores.max(), 5)
        worst = round(scores.min(), 5)
        avg = round(scores.mean(), 5)
        # Print out results; the data label is fixed because this cell only
        # evaluates the POS count features
        print("Simulation: {} | Data: {} | Classifier: {} | Best: {} | Worst: {} | Average: {}".format(n+1, 'POS', name, best, worst, avg))

        # Append a row with the results
        temp_row = [n+1, name, best, worst, avg]
        pos_results.loc[len(pos_results), :] = temp_row

        # Free up memory
        gc.collect()

In [None]:
# Plot the 'Average' column of the POS results, one line per classifier
plot_results(pos_results, "POS Data", 'Average')