## Tasks 6 - Random Forest Classification
This notebook trains a machine learning model on an n-gram representation of the reviews in order to identify the most important features in the reviews.
### This notebook pipeline
- data loading
- remove neutral reviews
- data preprocessing
- train/test dataset splitting
- data vectorisation
- create random forest classifier
- train model
- evaluate model
- feature extraction


### Load the data:

In [None]:
#Imports
import numpy as np
import pandas as pd
import os

In [None]:
# Load both review exports and stack them into a single DataFrame.
import utils, importlib
importlib.reload(utils)

# Both CSV files live in data/part/; utf-8-sig strips a leading BOM if present.
oneonetwo_data = pd.read_csv(
    os.path.join("data", "part", "Suomi112_cpd.csv"),
    encoding='utf-8-sig',
)
sos_live = pd.read_csv(
    os.path.join("data", "part", "SosLive_cpd.csv"),
    encoding='utf-8-sig',
)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
data = pd.concat([oneonetwo_data, sos_live], ignore_index=True)

In [None]:
# Show the combined reviews DataFrame as this cell's rich output.
data

In [None]:
# Print statistics of the loaded data.
# utils is reloaded so that edits to utils.py are picked up without restarting the kernel.
import utils, importlib
importlib.reload(utils)
# use_labels=False: presumably because the sentiment column has not been recoded to class labels yet — TODO confirm
utils.count_data_stat(data,use_labels=False)

### Recode the sentiment:

In [None]:
# Turn the sentiment column into binary class labels.
# NOTE(review): this cell overwrites `data` and its 'sentiment' column, so re-running it
# on already-recoded data compares label values against sentiment codes — run it once per load.
import utils, importlib
importlib.reload(utils)

# Step 1: drop every review whose sentiment equals the "Neutral" code.
is_neutral = data["sentiment"] == utils.sentiment_dict["Neutral"]
data = data.drop(data.loc[is_neutral].index)

# Step 2: map sentiment codes to class labels — the Positive code becomes the
# Positive label, every other remaining value becomes the Negative label.
positive_code = utils.sentiment_dict["Positive"]
positive_label = utils.labels_dict["Positive"]
negative_label = utils.labels_dict["Negative"]
data['sentiment'] = [
    positive_label if code == positive_code else negative_label
    for code in data['sentiment']
]

### Preprocess the data:

In [None]:
# Preprocess the review text in data['content'], one full pass over the column per step.
import utils, importlib
importlib.reload(utils) # reload so that edits to utils.py are picked up while developing
processor = utils.Processor() # Initialise the preprocessing helper (original note: a download path can be configured there — TODO confirm)
processor.ini_dowload() # Download the files needed for preprocessing (e.g. the stopword dictionary, per the original note)
data['content'] = data['content'].apply(lambda x: processor.preprocess(str(x))) # Cast to str, then strip unnecessary characters and turn emojis into text
data['content'] = data['content'].apply(lambda x: processor.tokenize(x)) # Tokenise the text; the following steps operate on tokens
data['content'] = data['content'].apply(lambda x: processor.not_no(x)) # Needed by the next steps; presumably handles "not"/"no" negations — TODO confirm
data['content'] = data['content'].apply(lambda x: processor.remove_stopwords(x,remove_len=3)) # Remove stopwords (as in the topic task)
data['content'] = data['content'].apply(lambda x: processor.process_tokens(x))  # Reduce the words to their root forms
data['content'] = data['content'].apply(lambda x: processor.detokenize(x)) # Join tokens back into text, because the sklearn vectorizer needs text input

### Create the test/train subsets:

In [None]:
# Check the class distribution of the preprocessed, recoded data.
# utils is reloaded so that edits to utils.py are picked up without restarting the kernel.
import utils, importlib
importlib.reload(utils)
utils.count_data_stat(data)

In [None]:
#Optionally make the data more balanced by removing some of the positive reviews (or use class weights in the classifier instead)
#rem_idxs = data[data.sentiment==utils.labels_dict["Positive"]].sample(frac = 0.5,random_state=0).index
#data = data.drop(rem_idxs)

In [None]:
# Split the data into train and test sets.
# A fixed-seed 20% random sample of the rows becomes the test set.
test = data.sample(frac=0.2, random_state=1)

# All rows whose index label was not sampled into the test set form the training set.
in_test = data.index.isin(test.index)
train = data[~in_test]

In [None]:
# Report the size and class statistics of each subset.
n_train = train.shape[0]
n_test = test.shape[0]

print("Train: ", n_train)
utils.count_data_stat(train)

print("Test: ", n_test)
utils.count_data_stat(test)

### Vectorise the reviews data:

In [None]:
# Vectorize the text into features the model can understand.
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

# TF-IDF over 2-grams and 3-grams only (ngram_range=(2, 3)); single words are not features.
vectorizer = TfidfVectorizer(ngram_range = (2,3))

# Fit on the training reviews only and vectorize them in the same pass
# (fit_transform avoids analysing the training corpus twice).
train_X = vectorizer.fit_transform(list(train['content']))

# The test reviews are projected onto the vocabulary learned from the training data.
test_X = vectorizer.transform(list(test['content']))

### Machine Learning model:

In [None]:
# Choose and create the classifier (a.k.a. the model, a.k.a. clf).
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

# Feel free to tune these hyper-parameters to improve the model.
forest_params = dict(
    n_estimators=350,
    max_depth=25,
    min_samples_leaf=2,
    min_samples_split=2,
    max_features="sqrt",
    criterion="gini",
    random_state=0,  # fixed seed so the forest is reproducible
    oob_score=False,
    # Class 1 gets half the weight of class 0; presumably compensates for
    # the class imbalance — TODO confirm which label is the majority class.
    class_weight={0: 1, 1: 0.5},
)
clf = RandomForestClassifier(**forest_params)

# Alternative model (SVM), kept for experimentation:
#clf = svm.SVC(kernel = "poly",degree=3, gamma="scale", C=1000,class_weight={0:1,1:0.5})

### Training the machine learning model:

In [None]:
# Train the classifier on the vectorized training reviews and their sentiment labels.
clf.fit(train_X, train['sentiment'])

### Model evaluation:

In [None]:
# Predict on both subsets and print the accuracies.
# Take these numbers with a grain of salt: the dataset is class-imbalanced,
# so accuracy alone can be misleading.
from sklearn.metrics import accuracy_score

# Predictions are kept in train_Y / test_Y because later cells reuse them.
train_Y = clf.predict(train_X)
test_Y = clf.predict(test_X)

train_acc = accuracy_score(y_true=train['sentiment'], y_pred=train_Y)
test_acc = accuracy_score(y_true=test['sentiment'], y_pred=test_Y)

print("Training Acc = ",train_acc)

print("Test Acc = ",test_acc)

In [None]:
# Plot the confusion matrices for the training and the testing data side by side —
# the clearest view of how the model performs on each class.
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Tick labels in the order of the sorted class values.
# Assumes utils.labels_dict maps Negative -> 0 and Positive -> 1 — TODO confirm.
class_names = ['Negative', 'Positive']

# (panel title, true labels, predicted labels) for each subplot, left to right.
panels = [
    ('Training data confusion matrix:', train['sentiment'], train_Y),
    ('Testing data confusion matrix:', test['sentiment'], test_Y),
]

fig, ax = plt.subplots(1, 2, figsize=(12, 5))
for axis, (title, y_true, y_pred) in zip(ax, panels):
    matrix = confusion_matrix(y_true, y_pred)
    ConfusionMatrixDisplay(matrix, display_labels=class_names).plot(ax=axis)
    axis.set_title(title)

### Feature extraction:

In [None]:
# Create the feature-selection tool on top of the already-trained classifier.
from sklearn.feature_selection import SelectFromModel

# prefit=True: reuse clf as trained above instead of fitting it again.
# max_features caps the number of selected features at 200.
# NOTE(review): the default importance threshold still applies, so fewer than
# 200 features may be selected — confirm this is intended.
sel = SelectFromModel(
    estimator=clf,
    prefit=True,
    max_features=200,
)

In [None]:
# Collect the selected features into a table.
from itertools import compress

# Column indices (into the vectorizer vocabulary) of the features kept by the selector.
selected_idx = sel.get_support(indices=True)

features = pd.DataFrame({
    "idx": selected_idx,
    # Importance of each selected feature as reported by the trained forest.
    "importance": sel.estimator.feature_importances_[selected_idx],
    # The n-gram text behind each selected feature.
    "names": np.array(vectorizer.get_feature_names_out())[selected_idx],
})

# Classify each selected n-gram on its own, as if it were a complete review.
features["prediction"] = clf.predict(vectorizer.transform(features["names"]))

In [None]:
# Show the table of extracted features as this cell's rich output.
features