# Product Sentiment Analysis Classification


In this notebook we will be doing sentiment classification in python using Classical ML techniques:
1. LOGISTIC REGRESSION (Implemented)
2. LSTM(To be Done)
3. LLM(To be Done)

# Step 0. Read in Data

In [1]:
def prepare_data(file_location):
    import warnings
    warnings.filterwarnings('ignore')
    import pandas as pd 
    print("\n----Inside prepare_data Stage----\n")
    # Load in dataset
    column_names = ['Review_UID', 'Product', 'Sentiment', 'Review']
    df = pd.read_csv(file_location,header=None,names=column_names)
    print("Raw Dataset info:")
    print(df.shape)
    df.info()
    df = df.dropna()
    print("\n----Dataset after dropping null info:----\n")
    print(df.shape)
    df.info()
    print("\n----End of prepare_data Stage----\n")
    return df


## Quick EDA

In [2]:
def quick_eda(df):
    import matplotlib.pyplot as plt
    plt.style.use('ggplot')
    print("\n----Inside EDA Stage----\n")
    print("Plotting count of reviewy by product")
    ax = df['Product'].value_counts().sort_index() \
        .plot(kind='bar',
            title='Count of Reviews by Products',
            figsize=(10, 5))
    ax.set_xlabel('Review Products')
    plt.show()
    
    print("Plotting count of reviewy by product")
    scores = df['Sentiment'].value_counts()
    plt.pie(df['Sentiment'].value_counts(), 
            labels=scores.index, 
            autopct='%1.1f%%', 
            radius=2, 
            colors=['#3bccff','#ff3333', '#ffd400','#014421'])
    plt.show()
    print("\n----End of eda Stage----\n")

# Step 1. Data Cleaning

#### Helper Functions

In [3]:
#This function converts to lower-case, removes square bracket, removes numbers and punctuation
def text_normalize_basic(text):
    import re
    import string
    from nltk.corpus import stopwords
    stopwords = set(stopwords.words('english'))
    #Basics
    text = text.lower() # transform text to lowercase
    text = re.sub(r'\[.*?\]', '', text) # remove '[]' and words in it
    text = re.sub(r'https?://\S+|www\.\S+', '', text) # remove links
    text = re.sub(r'<.*?>+', '', text) # remove '<>' and words in it
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)  #remove punctuation
    text = re.sub(r'[‘’“”…]', '', text) # remove "" and words in it
    text = re.sub(r'\n', '', text) # remove line feeds
    text = re.sub(r'\w*\d\w*', '', text) # remove numbers and words with numbers
    # Remove Stopwords
    text = [word for word in text.split(' ') if word not in stopwords] # remove stopwords
    text=' '.join(text) # put words back together to form a text
    return text
def text_normalize_stemming(text):
    import nltk
    stemmer = nltk.SnowballStemmer('english')
    #stemmer = nltk.porter.PorterStemmer()
    text = [stemmer.stem(word) for word in text.split(' ')] # stem the text
    text=" ".join(text) # put words back together to form a text
    return text
def text_normalize_emoji(text):
    import emoji
    # Custom processing
    text= emoji.demojize(text, delimiters=("", ""))
    return text
def text_normalize_lemmmatize(text):
    from nltk.stem import WordNetLemmatizer
    # Intilize Lemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    #Lemmatization
    # Apply 
    text = ' '.join([wordnet_lemmatizer.lemmatize(word , pos='v') for word in text.split(' ')])
    return text


def data_normalization(df):    
    
    import pandas as pd
    import numpy as np
    import nltk

    nltk.download('stopwords')
    nltk.download('wordnet')

    cleaned = lambda x: text_normalize_basic(x)
    stemmed = lambda x: text_normalize_stemming(x)
    emojised = lambda x: text_normalize_emoji(x)
    lemmatized = lambda x: text_normalize_lemmmatize(x)
    
    # Apply Normalization
    df['cleaned_review'] = pd.DataFrame(df.Review.apply(cleaned))
    df['cleaned_review']  = df['cleaned_review'].apply(lambda x: " ".join(x.split()) if isinstance(x, str) else x)
    
    print("\n----Dataset after cleaning info:----\n")
    print("Cleaned Dataset info:")
    print(df.shape)
    df.info()
    
    print("\n----Removing null or spaces in Dataset after cleaning info:----\n")
    df = df.replace(r'^\s*$',float('NaN'),regex=True)
    #df['cleaned_review'].replace(" ", np.nan, inplace=True)
    df.dropna(inplace=True)
    print("\n----Cleaned Dataset after dropping null info:----\n")
    print(df.shape)
    df.info()
    
    df['stemmed_review'] = pd.DataFrame(df.cleaned_review.apply(stemmed))
    print("\n----Removing null or spaces in Dataset after stemming info:----\n")
    df['stemmed_review'].replace(" ", np.nan, inplace=True)
    df.dropna(subset=['stemmed_review'], inplace=True)
    print("\n----Cleaned Dataset after dropping null info:----\n")
    print(df.shape)
    
    df['lemmatized_review'] = pd.DataFrame(df.stemmed_review.apply(lemmatized))
    print("\n----Removing null or spaces in Dataset after lemmatizing info:----\n")
    df['lemmatized_review'].replace(" ", np.nan, inplace=True)
    df.dropna(subset=['lemmatized_review'], inplace=True)
    print("\n----Cleaned Dataset after dropping null info:----\n")
    print(df.shape)
    df['emojised_review'] = pd.DataFrame(df.lemmatized_review.apply(emojised))
    
    # Let's take a look at the updated text

    df_processed = df[["Review_UID","Product","Sentiment","Review","cleaned_review","stemmed_review","lemmatized_review","emojised_review"]]
    print("The processed dataset created is :")
    print(df_processed.head(5))
    return df_processed

In [4]:
def preprocess_data(df):
    
    import pandas as pd
    
    print("\n----Inside preprocess_data Stage----\n")
    
    ### Checking Missing values in the Data Set and printing the Percentage for Missing Values for Each Columns ###

    count = df.isnull().sum().sort_values(ascending=False)
    percentage = ((df.isnull().sum()/len(df)*100)).sort_values(ascending=False)
    missing_data = pd.concat([count, percentage], axis=1,
    keys=['Count','Percentage'])

    print('Count and percentage of missing values for the columns:')
    missing_data
    print('Dataset info before dropna:')
    print(df.shape)
    df.info()
    
    #Drop Rows with null columns
    df.dropna(subset=['Review'], inplace=True)
    # reset index just to be safe
    df.reset_index(drop=True, inplace=True)
    print('Dataset info after dropna:')
    print(df.shape)
    df.info()
    
    df = df.drop_duplicates()
    df.reset_index(drop=True, inplace=True)
    print('Dataset info after drop duplicate:')
    print(df.shape)
    df.info()
    
    print("---\nRunning Normalization on the Dataset...../n ")
    df = data_normalization(df)
   
    #Drop Rows with null columns
    df = df.dropna()
    # reset index just to be safe
    df.reset_index(drop=True, inplace=True)
    print('Normalized Dataset info after dropna:')
    print(df.shape)
    df.info()
    df.to_csv('../data/processed/training.csv')
    print("\n----End of preprocess_data Stage----\n")
    return df

In [5]:
def train_test_split(df):
    from sklearn.model_selection import train_test_split
    print("The datset for training is:")
    print(df.shape)
    df.info()
    df.head(10)    

    Independent_var = df.cleaned_review
    Dependent_var = df.Sentiment

    IV_train, IV_test, DV_train, DV_test = train_test_split(Independent_var, Dependent_var, test_size = 0.2, random_state = 225)

    print('IV_train :', len(IV_train))
    print('IV_test  :', len(IV_test))
    print('DV_train :', len(DV_train))
    print('DV_test  :', len(DV_test))

    IV_train.to_csv(f'../data/interim/X_train.csv', index=False)
    IV_test.to_csv(f'../data/interim/X_test.csv', index=False)
    DV_train.to_csv(f'../data/interim/y_train.csv', index=False)
    DV_test.to_csv(f'../data/interim/y_test.csv', index=False)
def training_classifier():
    import pandas as pd
    import numpy as np
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression

    tvec = TfidfVectorizer()
    clf2 = LogisticRegression(solver = "lbfgs")


    from sklearn.pipeline import Pipeline
    IV_train= pd.read_csv('../data/interim/X_train.csv')
    DV_train= pd.read_csv('../data/interim/y_train.csv')
    
    model = Pipeline([('vectorizer',tvec),('classifier',clf2)])

    model.fit(IV_train['cleaned_review'], DV_train['Sentiment'])

    import pickle
    with open(f'../data/external/model.pkl', 'wb') as f:
        pickle.dump(model, f)

    print("\n logistic regression classifier is trained on sentiment data and saved to PV location ../external/model.pkl ----")
def predict_test_data():
    import pandas as pd
    import numpy as np
    import pickle
    from sklearn.metrics import confusion_matrix
    
    print("----Inside predict_test_data stage ----")
    with open(f'../data/external/model.pkl', 'rb') as f:
        model = pickle.load(f)

    IV_test= pd.read_csv('../data/interim/X_test.csv')
    DV_test= pd.read_csv('../data/interim/y_test.csv')
    predictions = model.predict(IV_test['cleaned_review'])
    np.save(f'../data/external/y_test_pred.npy', predictions)

    confusion_matrix(predictions, DV_test)
def predict_validation_data():
    import pandas as pd
    import numpy as np
    import pickle
    from sklearn.metrics import confusion_matrix
    
    print("----Inside predict_test_data stage ----")
    with open(f'../data/external/model.pkl', 'rb') as f:
        model = pickle.load(f)

    IV_test= pd.read_csv('../data/interim/X_valid.csv')
    DV_test= pd.read_csv('../data/interim/y_valid.csv')
    predictions = model.predict(IV_test)
    np.save(f'../data/external/y_valid_pred.npy', predictions)

    confusion_matrix(predictions, DV_test)
def get_metrics( test_type):
    import pandas as pd
    import numpy as np
    from sklearn.metrics import accuracy_score, precision_score, recall_score
    
    variables = {"test": ["../data/external/y_test_pred.npy", "../data/interim/y_test.csv"], "valid": ["../data/external/y_valid_pred.npy", "../data/interim/y_valid.csv"]}
    predictions_csv = ""
    dv_test_csv = ""
    
    if test_type == 'test' :
        predictions_csv = variables['test'][0]
        dv_test_csv = variables['test'][1]
    elif test_type == 'valid':
        predictions_csv = variables['valid'][0]
        dv_test_csv = variables['valid'][1]
    else:
        print("Invalid option")
    print(predictions_csv)
    print("Above is path variable")   
    predictions= np.load(f'{predictions_csv}', allow_pickle=True)
    DV_test= pd.read_csv(f'{dv_test_csv}')

    print("Accuracy : ", accuracy_score(predictions, DV_test))
    print("Precision : ", precision_score(predictions, DV_test, average = 'weighted'))
    print("Recall : ", recall_score(predictions, DV_test, average = 'weighted'))

# Main : Load training data and train the model

In [6]:
loaeded_dataset=prepare_data('../data/raw/training.csv')

quick_eda(loaeded_dataset)
processed_data=preprocess_data(loaeded_dataset)
df=processed_data

train_test_split(df)
training_classifier()
predict_test_data()
get_metrics('test')

#####
#example = ["The professional dota 2 scene is fucking exploding and I completely welcome it."]
#result = model.predict(example)

#print(result)


----Inside prepare_data Stage----

Raw Dataset info:
(74682, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Review_UID  74682 non-null  int64 
 1   Product     74682 non-null  object
 2   Sentiment   74682 non-null  object
 3   Review      73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB

----Dataset after dropping null info:----

(73996, 4)
<class 'pandas.core.frame.DataFrame'>
Index: 73996 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Review_UID  73996 non-null  int64 
 1   Product     73996 non-null  object
 2   Sentiment   73996 non-null  object
 3   Review      73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.8+ MB

----End of prepare_data Stage----


----Inside preprocess_data Stage----

Count and perc

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vbiswas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/vbiswas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



----Dataset after cleaning info:----

Cleaned Dataset info:
(71656, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71656 entries, 0 to 71655
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Review_UID      71656 non-null  int64 
 1   Product         71656 non-null  object
 2   Sentiment       71656 non-null  object
 3   Review          71656 non-null  object
 4   cleaned_review  71656 non-null  object
dtypes: int64(1), object(4)
memory usage: 2.7+ MB

----Removing null or spaces in Dataset after cleaning info:----


----Cleaned Dataset after dropping null info:----

(69840, 5)
<class 'pandas.core.frame.DataFrame'>
Index: 69840 entries, 0 to 71655
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Review_UID      69840 non-null  int64 
 1   Product         69840 non-null  object
 2   Sentiment       69840 non-null  object
 3   Revie

# Model Validation

In [7]:
validation_dataset=prepare_data('../data/raw/validation.csv')


----Inside prepare_data Stage----

Raw Dataset info:
(1000, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Review_UID  1000 non-null   int64 
 1   Product     1000 non-null   object
 2   Sentiment   1000 non-null   object
 3   Review      1000 non-null   object
dtypes: int64(1), object(3)
memory usage: 31.4+ KB

----Dataset after dropping null info:----

(1000, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Review_UID  1000 non-null   int64 
 1   Product     1000 non-null   object
 2   Sentiment   1000 non-null   object
 3   Review      1000 non-null   object
dtypes: int64(1), object(3)
memory usage: 31.4+ KB

----End of prepare_data Stage----



In [8]:

def prepare_validation_data():
    import pandas as pd
    cleaned_text= lambda x: text_normalize_basic(x)
    
    # Apply Normalization
    validation_dataset['cleaned_review'] = pd.DataFrame(validation_dataset.Review.apply(cleaned_text))
    validation_dataset_X = validation_dataset['cleaned_review']
    validation_dataset_y = validation_dataset['Sentiment']

    validation_dataset_X.to_csv(f'../data/interim/X_valid.csv', index=False)
    validation_dataset_y.to_csv(f'../data/interim/y_valid.csv', index=False)
    
def predict_valid_data():
    import numpy as np
    import pandas as pd
    import pickle
    print("----Inside predict_valid_data stage ----")
    with open(f'../data/external/model.pkl', 'rb') as f:
        model = pickle.load(f)

    IV_test= pd.read_csv('../data/interim/X_valid.csv')

    predictions = model.predict(IV_test['cleaned_review'].values.astype('U'))
    np.save(f'../data/external/y_valid_pred.npy', predictions)

    df_predictions = pd.DataFrame(predictions)
    df_predictions.to_csv('../data/external/y_valid_pred.csv')
    print("----Successfully predicted in y_valid_pred.csv file ----")


In [10]:
prepare_validation_data()
predict_valid_data()
get_metrics('valid')

----Inside predict_valid_data stage ----
----Successfully predicted in y_valid_pred.csv file ----
../data/external/y_valid_pred.npy
Above is path variable
Accuracy :  0.872
Precision :  0.8765229556449262
Recall :  0.872
