# SENTIMENT ANALYSIS 

In this document, we will be classifying for Google and IOS app store reviews using sentiment analysis into positive, neutral and negative reviews. The result would be separating the original reviews data into 3 separate datasets, and conducting topic modeling on each. The steps to achieve this are as follows: 

#### Manual data labeling 
1. Calculate the mean score (rating) across dataset  
2. Calculate the standard deviation of the scores/rating across dataset  
3. Calculate and store $\mu + \sigma$ in variables called as “upper_accepted” and “lower_accepted”  
4. Manually label data (add column "man_label" to data), as follows:  
	A. If score/rating > upper_accepted  $\rightarrow$  man_label = positive  
	B. If score < lower_accepted $\rightarrow$ man_label = negative  
	C. If lower_accepted $\leq$  score $\leq$ upper_accepted  $\rightarrow$ man_label = neutral  


#### Sentiment Analysis Labeling 
0. Do cross-validation (train, test, validation) 
1. Use various sentiment analysis tools to label data 
2. Use other classification and clustering algorithms to label data 
3. Find the algorithm/technique with the most accurate results (compared with our manual labeling)
4. Do sanity checks of reading the reviews by authors to make sure the labels created make sense  
5. Save 3 separate datasets, `positive_reviews`, `negative_reviews` and `neutral_reviews` 

#### TOPIC MODELING for each 
Next, in a separate file, each of the three datasets above (separately for iOS and Google, of course) will go through topic modeling.  

In [3]:
%%capture 
!pip install nltk
!pip install gensim
!pip install itertools
!pip install spacy
!pip install langdetect
!pip install pprint
!pip install pyLDAvis
!pip install vaderSentiment
!pip install textblob
!pip install keras
!pip install transformers
!pip install tensorflow

In [13]:
import os
import re
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem import WordNetLemmatizer
import gensim
from gensim.corpora.dictionary import Dictionary
from gensim.corpora import MmCorpus
from collections import defaultdict
import itertools
from gensim.models.tfidfmodel import TfidfModel
import spacy
from langdetect import detect, DetectorFactory
from gensim.models import CoherenceModel
from string import punctuation
from pprint import pprint
import gensim.models
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.util import ngrams
from gensim.models import Phrases
from gensim.models.phrases import Phraser
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis.gensim
import pyLDAvis.gensim_models

from textblob import TextBlob
import vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

from textblob import Word
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from nltk.sentiment import SentimentIntensityAnalyzer

from sklearn.naive_bayes import (
    BernoulliNB,
    ComplementNB,
    MultinomialNB,
)

from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.layers import LeakyReLU

from gensim.models import KeyedVectors
from transformers import TFBertModel, BertTokenizer
import tensorflow as tf

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

import matplotlib.pyplot as plt
%matplotlib inline


In [6]:
nltk.download([
    "names",
    "stopwords",
    "state_union",
    "twitter_samples",
    "movie_reviews",
    "averaged_perceptron_tagger",
    "vader_lexicon",
    "punkt"])

[nltk_data] Downloading package names to /home/yekta/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.
[nltk_data] Downloading package stopwords to /home/yekta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package state_union to
[nltk_data]     /home/yekta/nltk_data...
[nltk_data]   Unzipping corpora/state_union.zip.
[nltk_data] Downloading package twitter_samples to
[nltk_data]     /home/yekta/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/yekta/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/yekta/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/yekta/nltk_data...
[nltk_data] Downloading package punkt to /home/yekta/nltk_d

True

In [7]:
classifiers = {
    "BernoulliNB": BernoulliNB(),
    "ComplementNB": ComplementNB(),
    "MultinomialNB": MultinomialNB(),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(),
    "MLPClassifier": MLPClassifier(max_iter=1000),
    "AdaBoostClassifier": AdaBoostClassifier(),
}

In [8]:
# ---------------------- START OF CHATGPT CODE
# PROMPT was: finish this function: def save_file(name, extension, content):
def save_file(name, extension, content):
    """
    Save a file with the specified name, extension, and content.

    Args:
    name (str): The name of the file (without extension).
    extension (str): The file extension (e.g., 'txt', 'csv').
    content (str): The content to be saved in the file.

    Returns:
    None
    """
    # Combine name and extension to form the full file name
    filename = f"{name}.{extension}"

    # Check if the file already exists
    if os.path.exists(filename):
        print(f"The file {filename} already exists. Overwriting...")
    
    # Open the file and write the content
    try:
        with open(filename, 'w') as file:
            file.write(content)
        print(f"File {filename} saved successfully.")
    except Exception as e:
        print(f"Error while saving file: {e}")
# ---------------------- END OF CHATGPT CODE 



In [150]:
def remove_punc(text):
#     """Takes in some text, removes specific punctuations from them"""
#     punctuation_to_remove = r"[\[\]\(\)!?‘’'\'\"\.,;:]"
#     # punctuation_to_remove = r"[\[\]\(\)!?\"\.,;:]"
#     no_punc = re.sub(punctuation_to_remove, "", text)
#     no_punc_words = [word for word in text if word not in punctuation]
#     no_punc = ''.join(no_punc_words)
    
#     return no_punc
    punctuation_to_remove = r"[\[\]\(\)!?‘’'\".,;:]"
    no_punc = re.sub(punctuation_to_remove, "", text)
    return no_punc

stemmer = PorterStemmer()


def remove_stopwords_from_sentence(sentence):

    filtered_words = [word for word in sentence.split() if word.lower() not in stop_words]
    
    filtered_sentence = ' '.join(filtered_words)

    return filtered_sentence

In [152]:
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 'reply', 'banking', 'bank', 'app', 'apps', 'banks', 'cibc', 'rbc', 'td', 'scotia', 'bmo'])

## MANUAL DATA LABELING 

### IOS 

In [9]:
ios_data = pd.read_csv("top5banksReviews_v1.csv")
df_copy = ios_data.copy() # keeping a copy of the original data 

In [10]:
# ---------------------- START OF CHATGPT CODE
def is_english(text):
    DetectorFactory.seed = 0
    """input needs to be string"""
    try:
        return detect(text) == 'en'
    except:
        return False  
    
ios_data['is_english'] = ios_data['review'].apply(is_english)

# ---------------------- END OF CHATGPT CODE 

ios_data = ios_data[ios_data['is_english']].drop(columns=['is_english']) 

In [11]:
ios_data.to_csv("ios_english_only.csv")

In [12]:
ios_data.head()

Unnamed: 0,rating,title,review,Bank
0,4,Could Use Improvements,"If there is only one possible account, I shoul...",BMO
1,4,Verify,Suddenly I am being asked to verify this devic...,BMO
2,1,Works terrible on my new phone,"On my new phone, practically every time I open...",BMO
3,1,Bare bones app,No features to make this app stand out amongst...,BMO
4,1,Not working with iOS12.2,This app won’t work with iOS12.2. Every time I...,BMO


In [15]:
average_ios_ratings = np.mean(ios_data['rating'])
average_ios_ratings

2.5542060278902383

In [17]:
std_ios_ratings = np.std(ios_data['rating'])
std_ios_ratings

1.675954978320234

In [21]:
upper_accepted = int(average_ios_ratings) + int(std_ios_ratings)
lower_accepted = int(average_ios_ratings) - int(std_ios_ratings)

In [55]:
def manual_labeling(score):
    man_label = ' '
    
    if score > upper_accepted: 
        man_label = 'positive'
    elif score <= lower_accepted: 
        man_label = 'negative'
    else:
        man_label = 'neutral'
        
    return man_label

In [56]:
ios_data['man_label'] = ios_data['rating'].apply(manual_labeling)    

In [57]:
ios_data.head()

Unnamed: 0,rating,title,review,Bank,man_label
0,4,Could Use Improvements,"If there is only one possible account, I shoul...",BMO,positive
1,4,Verify,Suddenly I am being asked to verify this devic...,BMO,positive
2,1,Works terrible on my new phone,"On my new phone, practically every time I open...",BMO,negative
3,1,Bare bones app,No features to make this app stand out amongst...,BMO,negative
4,1,Not working with iOS12.2,This app won’t work with iOS12.2. Every time I...,BMO,negative


In [58]:
ios_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11115 entries, 0 to 11979
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   rating     11115 non-null  int64 
 1   title      11114 non-null  object
 2   review     11115 non-null  object
 3   Bank       11115 non-null  object
 4   man_label  11115 non-null  object
dtypes: int64(1), object(4)
memory usage: 521.0+ KB


##### TextBLOB

In [84]:
polarity_blob = [TextBlob(review).sentiment.polarity for review in ios_data['review']]
# polarity_test = [TextBlob(review).sentiment.polarity for review in x_test]

In [85]:
polarity_blob[:10]

[0.11499999999999999,
 0.0,
 0.0957070707070707,
 -0.10833333333333334,
 0.0,
 -0.3,
 -0.3569444444444445,
 0.05416666666666667,
 -0.046875,
 -0.4212121212121212]

In [86]:
subjectivity_blob = [TextBlob(review).sentiment.subjectivity for review in ios_data['review']]
# subjectivity_test = [TextBlob(review).sentiment.subjectivity for review in x_test]

In [87]:
subjectivity_blob[:10]

[0.595,
 0.5,
 0.3366161616161616,
 0.425,
 0.25,
 0.5499999999999999,
 0.5673611111111111,
 0.4,
 0.42083333333333334,
 0.6848484848484849]

In [93]:
def convert_polarity_to_label(pol_scores):
    """ 
    Takes in as input a list of polarity scores, and replaces them with labels of positive, negative or neutral
    According to this: 
        * The value of polarity is between -1 and +1 --- a distance of 2, if I break it into 3 sections 
        * A value closer to -1 means more negative 
        * A value closer to +1 means more positive 
    I have set a harsh threshold of 0.6 (more than 0.5)
    Anything less than -1/3 is negative 
    Anything between -1/3 and 1/3 is neutral
    Anything between 1/3 and 1 is positive 
    """
    labels = []
    
    for score in pol_scores:
        if score <= -1*(1/3):
            labels.append('negative')
        elif score > -1 * (1/3) and score < 1/3: 
            labels.append('neutral')
        else:
            labels.append('positive')
            
    return labels
    

In [94]:
blob_labels = convert_polarity_to_label(polarity_blob)
ios_data['blob_labels'] = pd.Series(blob_labels)

In [95]:
ios_data.head()

Unnamed: 0,rating,title,review,Bank,man_label,blob_labels
0,4,Could Use Improvements,"If there is only one possible account, I shoul...",BMO,positive,neutral
1,4,Verify,Suddenly I am being asked to verify this devic...,BMO,positive,neutral
2,1,Works terrible on my new phone,"On my new phone, practically every time I open...",BMO,negative,neutral
3,1,Bare bones app,No features to make this app stand out amongst...,BMO,negative,neutral
4,1,Not working with iOS12.2,This app won’t work with iOS12.2. Every time I...,BMO,negative,neutral


In [96]:
def calculate_success(compare, what):
    count_success = 0
    compare = list(compare)
    what = list(what)

    for i in range(len(compare)):
        if compare[i] == what[i]:
            count_success += 1
        else:
            count_success = count_success

    print(count_success)
    print("% of success: ", count_success/len(compare))


print("TextBlob accuracy")
calculate_success(ios_data['man_label'], ios_data['blob_labels'])

TextBlob accuracy
2637
% of success:  0.23724696356275304


##### VADER

In [97]:
sentiment = SentimentIntensityAnalyzer()

In [99]:
vader_polarity = [sentiment.polarity_scores(review) for review in ios_data['review']]
# vader_testing = [sentiment.polarity_scores(review) for review in x_test]


In [106]:
vader_polarity[:3]

[{'neg': 0.0, 'neu': 0.941, 'pos': 0.059, 'compound': 0.4404},
 {'neg': 0.0, 'neu': 0.891, 'pos': 0.109, 'compound': 0.4696},
 {'neg': 0.026, 'neu': 0.92, 'pos': 0.054, 'compound': 0.6586}]

In [146]:
def convert_vader_to_label(vader_scores):
    """ 
    Takes in as input a list of vader dictionary scores.
    Each vader dictionary has scores for 'neg', 'neu' and 'pos'.
    This function finds the labels based on the compound score; a value between -1 and 1 
        * The value of compound is between -1 and +1 --- a distance of 2, if I break it into 3 sections 
        * A value closer to -1 means more negative 
        * A value closer to +1 means more positive 
    Anything less than -1/3 is negative 
    Anything between -1/3 and 1/3 is neutral
    Anything between 1/3 and 1 is positive 
    """
    labels = []
    
    for dict_ in vader_scores:
        #first, loop through the lists and grab the dictionary - each dictionary is one review 
        compound = dict_['compound']
        
        if compound <= -0.05:
            labels.append("negative")
        elif compound >= 0.05:
            labels.append("positive")
        else:
            labels.append("neutral")
    
    #another way to do it: 
#     for dict_ in vader_scores:
#         #---------------------------- START CHATGPT CODE 
#         dict_without_compound = {k: v for k, v in dict_.items() if k != "compound"}
#         max_key = max(dict_without_compound, key = dict_without_compound.get)
#         labels.append(max_key)
#         #---------------------------- END CHATGPT CODE 
                
#     for i in range(len(labels)):
#         if labels[i] == 'neg':
#             labels[i] = 'negative'
#         elif labels[i] == 'pos':
#             labels[i] = 'positive'
#         else:
#             labels[i] = 'neutral'
    
    return labels

In [147]:
vader_labels = convert_vader_to_label(vader_polarity)
ios_data['vader_labels'] = pd.Series(vader_labels)

In [148]:
print("Vader accuracy")
calculate_success(ios_data['man_label'], ios_data['vader_labels'])

Vader accuracy
3696
% of success:  0.33252361673414305


In [149]:
ios_data.head()

Unnamed: 0,rating,title,review,Bank,man_label,blob_labels,vader_labels
0,4,Could Use Improvements,"If there is only one possible account, I shoul...",BMO,positive,neutral,positive
1,4,Verify,Suddenly I am being asked to verify this devic...,BMO,positive,neutral,positive
2,1,Works terrible on my new phone,"On my new phone, practically every time I open...",BMO,negative,neutral,positive
3,1,Bare bones app,No features to make this app stand out amongst...,BMO,negative,neutral,negative
4,1,Not working with iOS12.2,This app won’t work with iOS12.2. Every time I...,BMO,negative,neutral,negative


##### MULTINOMIAL NAIVE BAYES

In [153]:
reviews = list(ios_data['review']) 
reviews_corpus = " ".join([str(review) for review in reviews])
tokenized_doc_to_sentences = sent_tokenize(reviews_corpus)
tokenized_doc_to_words = [word_tokenize(sent) for sent in tokenized_doc_to_sentences]

reviews_corpus_lc = list(ios_data['review'])
reviews_corpus_lc = [review.lower() for review in reviews_corpus_lc]

tokenized_doc_to_sentences_lc = sent_tokenize(' '.join(reviews_corpus_lc))
tokenized_doc_to_words_lc = [word_tokenize(sent) for sent in tokenized_doc_to_sentences_lc]

reviews_corpus_stpw = list(ios_data['review'].apply(remove_stopwords_from_sentence))
tokenized_doc_to_sentences_stpw = sent_tokenize(' '.join(reviews_corpus_stpw))
tokenized_doc_to_words_stpw = [word_tokenize(sent) for sent in tokenized_doc_to_sentences_stpw]


reviews_corpus_noPnc = [remove_punc(review) for review in reviews_corpus_stpw] 
tokenized_doc_to_sentences_noPnc = [remove_punc(sentence) for sentence in tokenized_doc_to_sentences_stpw]
tokenized_doc_to_words_noPnc = [word_tokenize(sent) for sent in tokenized_doc_to_sentences_noPnc]


In [157]:
# SOURCE for code: https://www.analyticsvidhya.com/blog/2022/07/sentiment-analysis-using-python/

token = RegexpTokenizer(r'[a-zA-Z0-9]+') # get rid of punctuation and non alphanumeric characters + tokenize to word 

cv = CountVectorizer(stop_words = 'english', ngram_range = (1,1), tokenizer = token.tokenize) #count word matrix 

reviews_word_counts = cv.fit_transform(ios_data['review'])




In [158]:
x_train, x_test, y_train, y_test = train_test_split(reviews_word_counts, ios_data['man_label'], test_size = 0.2, random_state = 11)

In [159]:
multinomialNB = MultinomialNB()
multinomialNB.fit(x_train, y_train)

In [161]:
multinomialNB_predicted = multinomialNB.predict(x_test)
accuracy_multinomialNB = metrics.accuracy_score(multinomialNB_predicted, y_test)

print("Multinomial NB Accuracy ", accuracy_multinomialNB)

Multinomial NB Accuracy  0.6675663517768781


##### LSTM

Source for code was originally [AnalyticsVidhya](https://www.analyticsvidhya.com/blog/2022/07/sentiment-analysis-using-python/), but the code used several [depricated](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer) modules/packages/functions from packages. As well, keras is now integrated into tensorflow. So, I used ChatGPT's help. 

In [167]:
def cleaning(df, stop_words):
    df['review'] = df['review'].apply(lambda x: ' '.join(x.lower() for x in x.split()))
    
    df['review'] = df['review'].apply(lambda x: re.sub(r'\d+', '', x))  # regex for digits
    
    df['review'] = df['review'].apply(lambda x: ' '.join(x for x in x.split() if x not in stop_words))
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    df['review'] = df['review'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
    
    return df


In [168]:
ios_cleaned = cleaning(ios_data, stop_words)

In [169]:
ios_cleaned.head()

Unnamed: 0,rating,title,review,Bank,man_label,blob_labels,vader_labels
0,4,Could Use Improvements,"one possible account, select time need deposit...",BMO,positive,neutral,positive
1,4,Verify,suddenly asked verify device almost every time...,BMO,positive,neutral,positive
2,1,Works terrible on my new phone,"new phone, practically every time open asks ve...",BMO,negative,neutral,positive
3,1,Bare bones app,feature make stand amongst canadian banks. poo...,BMO,negative,neutral,negative
4,1,Not working with iOS12.2,won’t work ios.. every time open turn white st...,BMO,negative,neutral,negative


In [170]:
tokenizer = Tokenizer(num_words = 500, split = ' ')
tokenizer.fit_on_texts(ios_cleaned['review'].values)

X = tokenizer.texts_to_sequences(ios_cleaned['review'].values)
X = pad_sequences(X)

y = pd.get_dummies(ios_cleaned['man_label']).values  

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)


In [176]:
from tensorflow.keras.layers import LeakyReLU

# Model Building
model = Sequential()
model.add(Embedding(input_dim=500, output_dim=120, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(704, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(352))  
model.add(LeakyReLU(alpha=0.3))   
model.add(Dense(3, activation='softmax'))  
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

None


In [177]:
model.fit(x_train, y_train, epochs=20, batch_size=32, verbose=1)

Epoch 1/20
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 731ms/step - accuracy: 0.5607 - loss: 0.9549
Epoch 2/20
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m207s[0m 744ms/step - accuracy: 0.6682 - loss: 0.7641
Epoch 3/20
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m220s[0m 789ms/step - accuracy: 0.6936 - loss: 0.7198
Epoch 4/20
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m230s[0m 829ms/step - accuracy: 0.7086 - loss: 0.6810
Epoch 5/20
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m225s[0m 809ms/step - accuracy: 0.7081 - loss: 0.6719
Epoch 6/20
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m231s[0m 833ms/step - accuracy: 0.7135 - loss: 0.6458
Epoch 7/20
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m225s[0m 807ms/step - accuracy: 0.7264 - loss: 0.6279
Epoch 8/20
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m242s[0m 870ms/step - accuracy: 0.7387 - loss: 0.6205
Epoch 9/

<keras.src.callbacks.history.History at 0x7fe2829d9c90>

In [178]:
model.evaluate(x_test, y_test)

[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 155ms/step - accuracy: 0.6127 - loss: 1.0943


[1.1226282119750977, 0.6320288181304932]

###### A LITTLE BACK AND FORTH WITH CHATGPT 
1. `input_dim = 500`:<br>
    This represents the size of the vocabulary or the number of unique tokens (words) that the Embedding layer will handle.
    In your code, the tokenizer is set to consider the 500 most frequent words (`num_words=500`). 
    This means you want to limit the vocabulary to the top 500 most frequent words found in your dataset.
    The `input_dim` should match this value.
    
    > Why 500? <br>
    
    This is a commonly used number to control the size of the vocabulary. 
    It's a trade-off between having enough words to cover important vocabulary and not making the model too large or slow by using an excessively large vocabulary. 
    You can adjust this value based on the size of your dataset and the problem you're working on.

    > `tokenizer = Tokenizer(num_words=500, split=' ')`

    `input_dim` should be set to `num_words + 1`, but we typically set `input_dim` to 500 if we are using the top 500 words.
    If you increase the value of `num_words`, you can make your model handle a larger vocabulary (**but keep in mind that this increases memory usage and computational time**).

2. `output_dim = 120`:<br>
    This refers to the dimensionality of the word embeddings.
    Word embeddings are continuous vector representations of words, and `output_dim` specifies the size of each word’s embedding vector.
   
    > Why 120?: <br>
        
    The value of `output_dim` is usually chosen based on empirical results or experimentation. 
    Typical values range from 50 to 300.
    Larger values (like 120, 300) usually capture more semantic information about words but require more computational resources.
    Setting `output_dim = 120` is a common choice, as it strikes a balance between capturing enough semantic information and managing memory usage. 
    However, this is a hyperparameter that you can tune based on your dataset and model performance.
    If you use pre-trained word embeddings (like GloVe, Word2Vec), `output_dim` is typically determined by the size of the pre-trained embeddings.
    If you are learning embeddings from scratch, you can experiment with different sizes.

3. Example of embedding setup:

`model.add(Embedding(input_dim=500, output_dim=120, input_length=X.shape[1]))`

    `input_dim=500`: You’re working with a vocabulary of the top 500 words, so the model will have embeddings for each of those 500 words.\\
    `output_dim=120`: Each word will be represented by a 120-dimensional vector in the embedding space.

**When to adjust these values:**

    `input_dim`: If you want your model to understand more than the top 500 words, you can increase this number. 
    However, increasing it might also make the model slower and more memory-intensive.

    `output_dim`: You can experiment with this value (e.g., 50, 100, 200) depending on how much information you want each word's embedding to capture. Larger values may improve the model’s ability to understand word relationships but at the cost of more computation.

***

While the epochs are running, I (Yekta) read the code in more detail and learned a few things. I'll write them down: 

1. `Sequential()` generates a model which is built layer by layer (in "sequence")
2. `.add(Embedding)` is the embedding layer, which basically replaced the Word2Vec or other word embeddings - which is basically the semantic representation of words (to find similar words). In here, the `input_length` is the length of the X value, which is the number of words per review
3. `SpatialDropout1D` is a regularization technique that drops 40% of the features (words here) to not overfit 
4. `LSTM` is obvious Long Short-Term Memory with 704 neurons. Why 704? Probably similar to above rational, drops 20% of the words to not overfit, and not sure about LSTM's internal memory but the `recurrent_dropout` is to avoid overfitting there
5. `Dense(352)` is the first fully connected layer with 352 neurons, and it uses a `LeakyReLU` activation function (which I had to add as a separate layer)
6. `Dense(3)` is the last layer, which gives me probabilities for positive, negative and neutral labels and uses `softmax`
7. `compile` determines the loss function (`categorical_crossentropy`: because it's multiclass classification), `optimizer`: adam, to tune the weights during learning to minimize loss, and `accuracy` for the performance measure)

##### WHAT IF I WANTED TO USE WORD2VEC or BERT

In [None]:

# -------------------------- START OF CHATGPT CODE 
# Step 1: Load pre-trained Word2Vec model (Google's pre-trained model is large)
# w2v_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# # Step 2: Create an embedding matrix (size: vocabulary size x embedding dimension)
# embedding_dim = 300  # Typically, Word2Vec uses 300-dimensional vectors
# embedding_matrix = np.zeros((500, embedding_dim))  # Initialize a matrix for top 500 words

# # Map each word in the vocabulary to its Word2Vec vector (if available)
# for word, i in tokenizer.word_index.items():
#     if i < 500:  # We are using the top 500 words
#         if word in w2v_model:
#             embedding_matrix[i] = w2v_model[word]

# # Step 3: Replace the embedding layer with the pre-trained Word2Vec embeddings
# model_word2Vec = Sequential()
# model_word2Vec.add(Embedding(input_dim=500, output_dim=embedding_dim, input_length=X.shape[1], 
#                     weights=[embedding_matrix], trainable=False))  # Freezing the embeddings
# model_word2Vec.add(SpatialDropout1D(0.4))
# model_word2Vec.add(LSTM(704, dropout=0.2, recurrent_dropout=0.2))
# model_word2Vec.add(Dense(352, activation=LeakyReLU(alpha=0.3)))
# model_word2Vec.add(Dense(3, activation='softmax'))
# model_word2Vec.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# -------------------------- END OF CHATGPT CODE 


In [None]:
model_word2Vec.fit(x_train, y_train, epochs=10, batch_size=32, verbose=1)

In [None]:
model_word2Vec.evaluate(x_test, y_test)

In [None]:


# -------------------------- START OF CHATGPT CODE 

# # Step 1: Load BERT model and tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# # Step 2: Tokenize the sentences and pad sequences
# inputs = tokenizer(ios_cleaned['review'].tolist(), padding=True, truncation=True, return_tensors='tf')

# # Step 3: Use BERT embeddings for input
# model_BERT = Sequential()
# model_BERT.add(tf.keras.layers.InputLayer(input_shape=(inputs['input_ids'].shape[1],)))  # Input layer size should match tokenized input length
# model_BERT.add(bert_model)  # Apply the BERT model to get embeddings
# model_BERT.add(SpatialDropout1D(0.4))
# model_BERT.add(LSTM(704, dropout=0.2, recurrent_dropout=0.2))
# model_BERT.add(Dense(352, activation=LeakyReLU(alpha=0.3)))
# model_BERT.add(Dense(3, activation='softmax'))
# model_BERT.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# -------------------------- END OF CHATGPT CODE 


In [None]:
# model_BERT.fit(x_train, y_train, epochs=10, batch_size=32, verbose=1)

In [None]:
# model_BERT.evaluate(x_test, y_test)

In [179]:
# label using the model 
X_ = tokenizer.texts_to_sequences(ios_cleaned['review'].values)
X_ = pad_sequences(X_, maxlen = X.shape[1])

predictions_model = model.predict(X_)

label_map = {0: 'negative',
             1: 'neutral',
             2: 'positive'}

predicted_labels_model = [label_map[np.argmax(pred)] for pred in predictions_model]

ios_cleaned['LSTM_label'] = pd.Series(predicted_labels_model)

[1m348/348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 186ms/step


In [180]:
# split the data into 3 
negative_data_ios = ios_cleaned[ios_cleaned['LSTM_label'] == "negative"]
neutral_data_ios = ios_cleaned[ios_cleaned['LSTM_label'] == "neutral"]
positive_data_ios = ios_cleaned[ios_cleaned['LSTM_label'] == "positive"]

negative_data_ios.to_csv("negative_data_ios.csv", index=False)
neutral_data_ios.to_csv("neutral_data_ios.csv", index=False)
positive_data_ios.to_csv("positive_data_ios.csv", index=False)