In [None]:
#Imports
import numpy as np
import pandas as pd
import os

In [None]:
# Load the translated review exports (sentiment already calculated) for both apps and both stores.
import utils, importlib
importlib.reload(utils)  # pick up edits made to utils.py without restarting the kernel

PART_DIR = os.path.join("data", "part")
# Metadata columns that are discarded right after loading each export.
GOOGLE_DROP_COLS = ['Unnamed: 0', 'userName', 'reviewId', 'at', 'reviewCreatedVersion', 'score', 'thumbsUpCount']
APPLE_DROP_COLS = ['Unnamed: 0', 'title', 'date', 'rating']


def _load_reviews(file_name, drop_cols, source_key):
    """Read one review CSV from PART_DIR, drop unused columns and tag the rows with their source.

    Args:
        file_name: Name of the CSV file inside PART_DIR.
        drop_cols: Column names to remove; a missing column raises KeyError.
        source_key: Key looked up in ``utils.souce_type_dict``; the value is stored in the
            new ``type`` column of every row.

    Returns:
        A new DataFrame with the remaining columns plus ``type``.
    """
    reviews = pd.read_csv(os.path.join(PART_DIR, file_name), encoding='utf-8-sig')
    reviews = reviews.drop(columns=drop_cols)
    reviews['type'] = utils.souce_type_dict[source_key]
    return reviews


data1g = _load_reviews("Suomi112_cpd_google.csv", GOOGLE_DROP_COLS, "Suomi112_google")
data1a = _load_reviews("Suomi112_cpd_apple.csv", APPLE_DROP_COLS, "Suomi112_apple")
data2g = _load_reviews("SosLive_cpd_google.csv", GOOGLE_DROP_COLS, "SosLive_google")
# This export has one extra column to discard.
data2a = _load_reviews("SosLive_cpd_apple.csv", APPLE_DROP_COLS + ['developerResponse'], "SosLive_apple")
# Merge all sources into one big dataframe (the `type` column remembers the source).
data = pd.concat([data1g, data1a, data2g, data2a], ignore_index=True)

In [None]:
# Load the two per-app CSV files and stack them into a single frame.
# NOTE(review): this rebinds `data`, replacing the frame that the previous cell builds from the
# four per-store files - confirm which of the two loading cells is meant to feed the rest of the notebook.
part_dir = os.path.join("data", "part")
oneonetwo_data = pd.read_csv(os.path.join(part_dir, "Suomi112_cpd.csv"), encoding='utf-8-sig')
sos_live = pd.read_csv(os.path.join(part_dir, "SosLive_cpd.csv"), encoding='utf-8-sig')
data = pd.concat([oneonetwo_data, sos_live], ignore_index=True)

In [None]:
# Display the loaded DataFrame as the output of this cell:
data

In [None]:
# Recode the sentiment into a binary target: 1 = positive, 0 = negative.
import utils, importlib
importlib.reload(utils)
# Guard against running this cell twice: after the recode 0 means "negative", so a second
# pass of the neutral filter would silently delete every negative review.
if not data.attrs.get("sentiment_recoded", False):
    # Remove neutral reviews (raw code 0).
    data = data.drop(data[data["sentiment"] == 0].index)
    # Keep 1 as it is and map every other remaining value (raw negative is -1) to 0.
    data["sentiment"] = data["sentiment"].where(data["sentiment"] == 1, 0)
    # Mark the frame so a re-run of this cell becomes a no-op; a freshly loaded frame has no flag.
    data.attrs["sentiment_recoded"] = True

In [None]:
# Preprocess the review text
import utils, importlib
importlib.reload(utils)  # keep the .py file up to date while coding
processor = utils.Processor()  # init preprocessing (original note: a download path can be set up for it)
processor.ini_dowload()  # download the files the preprocessing needs (e.g. the stopword dictionary)

# Ordered clean-up steps; each one is applied to the whole `content` column before the next starts.
content_steps = (
    lambda raw: processor.preprocess(str(raw)),  # get rid of unnecessary characters and stringify the emojis
    lambda text: processor.tokenize(text),  # token lists are needed by the next steps
    lambda tokens: processor.remove_stopwords(tokens, remove_len=2),  # remove stopwords (like in the topic task)
    lambda tokens: processor.process_tokens(tokens),  # strip the words down to their root words
    lambda tokens: processor.detokenize(tokens),  # back from tokens to text, because the sklearn vectorizer needs text
)
for step in content_steps:
    data['content'] = data['content'].apply(step)

In [None]:
# Check the distribution of the data (statistics come from utils.count_data_stat)
import importlib
import utils
importlib.reload(utils)  # re-import utils so recent edits to the module are used
utils.count_data_stat(data)

In [None]:
#Make the data more balanced - optional (or use weights in the classifier)
#rem_idxs = data[data.sentiment==sentiment_dict["Positive"]].sample(frac = 0.5,random_state=0).index
#data = data.drop(rem_idxs)

In [None]:
# Split the data into train and test sets:
# a reproducible (random_state=0) random sample of 20 % of the rows becomes the test set...
test = data.sample(frac=0.2, random_state=0)
# ...and every row whose index label was not sampled stays in the training set.
train = data.drop(index=test.index)

In [None]:
# Print the size of each split followed by its statistics from utils.count_data_stat:
print("Train: ", len(train["content"]))
utils.count_data_stat(train)
print("Test: ", len(test["content"]))
utils.count_data_stat(test)

In [None]:
# Vectorize the text into features the model can understand: TF-IDF over 2- and 3-grams.
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
vectorizer = TfidfVectorizer(ngram_range=(2, 3))
# Fit on the training split only and vectorize it in the same pass
# (equivalent to fit() followed by transform(), without processing the text twice).
train_X = vectorizer.fit_transform(train['content'])
# The test split is only transformed, so it reuses what was learned from the training text.
test_X = vectorizer.transform(test['content'])

In [None]:
# Choose and create the classifier (a.k.a. the model, a.k.a. clf):
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
# Feel free to change the parameters of the model to make it better:
clf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=200,
    min_samples_leaf=1,
    max_features="sqrt",
    criterion="gini",
    random_state=0,
    class_weight={0: 1, 1: 0.5},  # class 0 is weighted twice as much as class 1
)
# Alternative model:
#clf = svm.SVC(kernel = "poly",degree=3, gamma="scale", C=1000,class_weight={0:1,1:0.5})
# And train it:
clf.fit(train_X, train['sentiment'])

In [None]:
# Perform the predictions on both splits
# and print out the accuracies - mind the class imbalance!
from sklearn.metrics import accuracy_score

train_Y = clf.predict(train_X)
test_Y = clf.predict(test_X)

train_acc = accuracy_score(train['sentiment'], train_Y)
test_acc = accuracy_score(test['sentiment'], test_Y)

print("Training Acc = ", train_acc)
print("Test Acc = ", test_acc)

In [None]:
# Confusion matrices for the training and the testing data, side by side - the best visualization of the model performance
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
labels = ['happy', 'sad']  # NOTE(review): not used by the plots below; kept only in case another cell reads it
class_ids = [0, 1]  # sentiment codes after the recode: 0 = negative, 1 = positive
class_names = ['Negative', 'Positive']  # tick labels, in the same order as class_ids
fig, ax = plt.subplots(1, 2, figsize=(12, 5))

# labels=class_ids pins every matrix to 2x2 in a fixed class order, so it always lines up
# with the two tick labels even when one class is missing from a split.

# Confusion matrix when classifying the training data
ax[0].set_title('Training data confusion matrix:')
cm = confusion_matrix(train['sentiment'], train_Y, labels=class_ids)
cmd = ConfusionMatrixDisplay(cm, display_labels=class_names)
cmd.plot(ax=ax[0])

# Confusion matrix when classifying the testing data
ax[1].set_title('Testing data confusion matrix:')
cm2 = confusion_matrix(test['sentiment'], test_Y, labels=class_ids)
cmd2 = ConfusionMatrixDisplay(cm2, display_labels=class_names)
cmd2.plot(ax=ax[1]);  # trailing ';' keeps the display object's repr out of the cell output

In [None]:
######## Feature extraction:
# Create the feature selection tool: wrap the already-fitted classifier (prefit=True)
# and keep at most 200 features.
# NOTE(review): `threshold` is left at its default, so the importance cut-off still applies
# on top of the 200-feature cap - confirm against the scikit-learn documentation.
from sklearn.feature_selection import SelectFromModel
sel = SelectFromModel(estimator=clf, prefit=True, max_features=200)

In [None]:
# Collect the selected features into one table: position in the vectorizer output,
# importance, feature name, and the class predicted when that name alone is the input text.
from itertools import compress
selected_idx = sel.get_support(indices=True)
features = pd.DataFrame({
    "idx": selected_idx,
    "importance": sel.estimator.feature_importances_[selected_idx],
    "names": np.array(vectorizer.get_feature_names_out())[selected_idx],
})
features["prediction"] = clf.predict(vectorizer.transform(features["names"]))

In [None]:
# Display the table of selected features as the output of this cell
features

In [None]:
######## Feature extraction:
# NOTE(review): this cell repeats an earlier feature-selection cell of this notebook verbatim;
# it re-creates `sel` with identical arguments.
# Create the feature selection tool around the already-fitted `clf` (prefit=True), capped at 200 features:
from sklearn.feature_selection import SelectFromModel
sel = SelectFromModel(clf,prefit=True, max_features = 200)

In [None]:
# Build the table of selected features.
# NOTE(review): same code as an earlier cell of this notebook; it rebuilds `features` from scratch.
from itertools import compress  # NOTE(review): `compress` is not used in this cell
features = pd.DataFrame()
features["idx"] = sel.get_support(indices=True)  # positions of the selected features
features["importance"] = sel.estimator.feature_importances_[features["idx"]]  # importance of each selected feature
features["names"] =  np.array(vectorizer.get_feature_names_out())[features["idx"]]  # vectorizer feature name at each selected position
features["prediction"] = clf.predict(vectorizer.transform(features["names"]))  # class predicted when the feature name alone is the input text

In [None]:
# Display the table of selected features as the output of this cell
features

In [None]:
######## Feature extraction:
# NOTE(review): this cell repeats an earlier feature-selection cell of this notebook verbatim;
# it re-creates `sel` with identical arguments.
# Create the feature selection tool around the already-fitted `clf` (prefit=True), capped at 200 features:
from sklearn.feature_selection import SelectFromModel
sel = SelectFromModel(clf,prefit=True, max_features = 200)

In [32]:
# Build the table of selected features.
# NOTE(review): same code as an earlier cell of this notebook; it rebuilds `features` from scratch.
from itertools import compress  # NOTE(review): `compress` is not used in this cell
features = pd.DataFrame()
features["idx"] = sel.get_support(indices=True)  # positions of the selected features
features["importance"] = sel.estimator.feature_importances_[features["idx"]]  # importance of each selected feature
features["names"] =  np.array(vectorizer.get_feature_names_out())[features["idx"]]  # vectorizer feature name at each selected position
features["prediction"] = clf.predict(vectorizer.transform(features["names"]))  # class predicted when the feature name alone is the input text

In [33]:
# Display the table of selected features as the output of this cell
features

Unnamed: 0,idx,importance,names,prediction
0,12,0.001220,abl choos,1
1,214,0.000815,accid situat,1
2,223,0.000976,accident call,1
3,1000,0.000741,announc corona,1
4,1047,0.000541,announc time,1
...,...,...,...,...
195,22342,0.000562,would like,1
196,22363,0.000619,would nice,1
197,22379,0.000860,would possibl,1
198,22455,0.000589,write number,1
