In [None]:
import pandas as pd
import seaborn as sns
import re,json,nltk
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import warnings
warnings.filterwarnings("ignore")
import os

In [None]:
pip install openpyxl

In [None]:

# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

In [None]:
# Read the Excel dataset into a DataFrame and print its column/dtype summary.
DATASET_PATH = '/kaggle/input/3-classes-bangla-sa/3_classes_Bert_Preprocessed__Dataset.xlsx'
df = pd.read_excel(DATASET_PATH)
df.info()

In [None]:
# Show the first five rows as a quick sanity check of the loaded data.
df.head(n=5)

In [None]:
# Bar chart of how many rows carry each Label value.
# The column is passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, so a positional Series is no longer read as the x variable.
sns.countplot(x=df['Label'])

In [None]:
# Per-label row counts together with the total number of rows.
(df['Label'].value_counts(), df.shape[0])

In [None]:
# Drop every row that has at least one missing value.
# `thresh` is deliberately not passed: pandas >= 2.0 raises TypeError when both
# `how` and `thresh` are supplied, even if `thresh` is None.
df = df.dropna(axis=0, how='any')

In [None]:
# Renumber rows 0..n-1 after rows were removed, discarding the old index.
df = df.reset_index(drop=True)

In [None]:
# Keep only the rows whose Label is strictly greater than 0.
# NOTE(review): presumably this leaves labels 1 and 2 only -- confirm against
# the value counts of the raw data.
keep_row = df['Label'].gt(0)
df_filtered = df.loc[keep_row]
df = df_filtered

In [None]:
# Bar chart of the Label distribution after filtering.
# The column is passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, so a positional Series is no longer read as the x variable.
sns.countplot(x=df['Label'])

In [None]:
import sklearn
from sklearn import svm

# SGDClassifier's logistic loss was renamed from 'log' to 'log_loss' in
# scikit-learn 1.1 and the old spelling was removed in 1.3. Pick whichever name
# the installed version accepts so the grid works on both old and new releases.
_sk_major, _sk_minor = (
    int(part) for part in re.match(r'(\d+)\.(\d+)', sklearn.__version__).groups()
)
SGD_LOGISTIC_LOSS = 'log_loss' if (_sk_major, _sk_minor) >= (1, 1) else 'log'

# Candidate estimators and the hyper-parameter grid searched for each one.
# Keys are the display names written to the result tables; every entry holds
#   'model'  -- an unfitted estimator instance
#   'params' -- the parameter grid handed to GridSearchCV
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear', 'sigmoid'],
            'gamma': ['auto', 'scale'],
            # NOTE(review): scikit-learn documents this option as ignored for
            # binary problems; it is kept so reported best_params_ keep their shape.
            'decision_function_shape': ['ovo', 'ovr'],
        },
    },
    'decision tree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'max_depth': [125, 500, 700, 900],
            'max_features': ["sqrt", "log2"],
            'min_samples_split': [15, 55, 95],
            'criterion': ['entropy'],
        },
    },
    'logistic_regression': {
        # `multi_class` is left at its default ('auto'); passing it explicitly
        # is deprecated in recent scikit-learn releases.
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1, 5, 10, 25],
        },
    },
    'multinomial naive bayes': {
        'model': MultinomialNB(),
        'params': {
            'alpha': [.10, .25, .40, .75, .90],
        },
    },
    'k nearest neighbors': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3, 4, 5, 6, 7],
            'weights': ['uniform', 'distance'],
            'algorithm': ['ball_tree', 'kd_tree', 'brute'],
        },
    },
    'Stochastic Gradient Descent': {
        'model': SGDClassifier(),
        'params': {
            'loss': [SGD_LOGISTIC_LOSS],
            'penalty': ['l2', 'l1', 'elasticnet'],
            'alpha': [.0001, .0005, .0009, .0012],
        },
    },
}

In [None]:
# Scorer names handed to GridSearchCV during cross-validation.
scores_function = [
    'precision_macro',
    'recall_macro',
    'f1',
    'accuracy',
]

In [None]:
# Bag-of-words features.
# Split the raw text first and fit the vocabulary on the training rows only, so
# nothing learned from the test rows leaks into the features used for training.
# The row partition is unchanged: it depends only on the number of rows,
# test_size and random_state.
tokens_train, tokens_test, y_train, y_test = train_test_split(
    df['Token'], df['Label'], test_size=0.2, random_state=42
)

vect = CountVectorizer()
x_train = vect.fit_transform(tokens_train)
x_test = vect.transform(tokens_test)

# Whole-corpus matrix in the train-fitted feature space, kept under its
# original name for any later cell that reads it. Do not fit models on it.
x_bow = vect.transform(df['Token'])

In [None]:
# Class labels used as tick labels on the confusion-matrix plots.
categories = [
    1,
    2,
]

In [None]:
# Grid-search every candidate model on the bag-of-words training split and
# evaluate the refitted best estimator on the held-out test split.
scores = []
for model_name, mp in model_params.items():
    # A single multi-metric search evaluates all scorers in one pass instead of
    # repeating the whole grid once per scorer. The estimator is refitted on the
    # last listed metric ('accuracy' as configured), so best_params_ and the
    # predictions below are selected by that metric.
    clf = GridSearchCV(
        mp['model'],
        mp['params'],
        cv=5,
        scoring=scores_function,
        refit=scores_function[-1],
    )
    clf.fit(x_train, y_train)

    # Predict once and reuse the result for every test metric.
    test_pred = clf.predict(x_test)
    scores.append({
        'model': model_name,
        'best_params': clf.best_params_,
        'precision': precision_score(y_test, test_pred, average='macro'),
        'recall': recall_score(y_test, test_pred, average='macro'),
        'f1': f1_score(y_test, test_pred, average='macro'),
        'accuracy': accuracy_score(y_test, test_pred),
    })

# One row per model; saved to Excel and displayed best-accuracy-first.
df_score = pd.DataFrame(scores, columns=['model', 'best_params', 'precision', 'recall', 'f1', 'accuracy'])
df_score.to_excel('2 classes BOW ML parameters.xlsx')
df_score.sort_values(by=['accuracy'], ascending=False)



In [None]:
from sklearn.metrics import confusion_matrix

# NOTE: `clf` is whichever search the preceding training loop fitted last
# (the final entry of model_params), not necessarily the top-ranked model.
# Predictions stay 1-D; the metric functions do not need a column vector.
y_pred = clf.best_estimator_.predict(x_test)

class_names = categories

# Pin the label order so matrix rows/columns always line up with the tick
# labels, even if a class happens to be absent from the test split.
cm = confusion_matrix(y_test, y_pred, labels=class_names)

fig = plt.figure(figsize=(16, 14))
ax = plt.subplot()
sns.heatmap(cm, annot=True, ax=ax, fmt='g');  # annot=True writes the counts in the cells

# x axis: predicted class
ax.set_xlabel('Predicted', fontsize=20)
ax.xaxis.set_label_position('bottom')
plt.xticks(rotation=90)
ax.xaxis.set_ticklabels(class_names, fontsize=10)
ax.xaxis.tick_bottom()

# y axis: true class
ax.set_ylabel('True', fontsize=20)
ax.yaxis.set_ticklabels(class_names, fontsize=10)
plt.yticks(rotation=0)

plt.title('Refined Confusion Matrix', fontsize=20)

# Distinct file name so this figure is not overwritten by the later
# confusion-matrix cells that save to 'ConMat24.png'.
plt.savefig('ConMat24_BOW.png')
plt.show()

In [None]:
# TF-IDF unigram features.
# Split the raw text first and fit the vocabulary/IDF weights on the training
# rows only, so no statistics from the test rows leak into training features.
# The row partition is unchanged: it depends only on the number of rows,
# test_size and random_state.
tokens_train, tokens_test, y_train, y_test = train_test_split(
    df['Token'], df['Label'], test_size=0.2, random_state=42
)

# Documents are whitespace-tokenised (str.split with no arguments).
tfidf_unigram = TfidfVectorizer(use_idf=True, tokenizer=str.split)
x_train = tfidf_unigram.fit_transform(tokens_train)
x_test = tfidf_unigram.transform(tokens_test)

# Whole-corpus matrix in the train-fitted feature space, kept under its
# original name for any later cell that reads it. Do not fit models on it.
x_tfidf_unigram = tfidf_unigram.transform(df['Token'])

In [None]:
# Grid-search every candidate model on the TF-IDF unigram training split and
# evaluate the refitted best estimator on the held-out test split.
scores = []
for model_name, mp in model_params.items():
    # A single multi-metric search evaluates all scorers in one pass instead of
    # repeating the whole grid once per scorer. The estimator is refitted on the
    # last listed metric ('accuracy' as configured), so best_params_ and the
    # predictions below are selected by that metric.
    clf = GridSearchCV(
        mp['model'],
        mp['params'],
        cv=5,
        scoring=scores_function,
        refit=scores_function[-1],
    )
    clf.fit(x_train, y_train)

    # Predict once and reuse the result for every test metric.
    test_pred = clf.predict(x_test)
    scores.append({
        'model': model_name,
        'best_params': clf.best_params_,
        'precision': precision_score(y_test, test_pred, average='macro'),
        'recall': recall_score(y_test, test_pred, average='macro'),
        'f1': f1_score(y_test, test_pred, average='macro'),
        'accuracy': accuracy_score(y_test, test_pred),
    })

# One row per model; saved to Excel and displayed best-accuracy-first.
df_score = pd.DataFrame(scores, columns=['model', 'best_params', 'precision', 'recall', 'f1', 'accuracy'])
df_score.to_excel('2 classes TF-IDF unigram ML parameters.xlsx')
df_score.sort_values(by=['accuracy'], ascending=False)


In [None]:
from sklearn.metrics import confusion_matrix

# NOTE: `clf` is whichever search the preceding training loop fitted last
# (the final entry of model_params), not necessarily the top-ranked model.
# Predictions stay 1-D; the metric functions do not need a column vector.
y_pred = clf.best_estimator_.predict(x_test)

class_names = categories

# Pin the label order so matrix rows/columns always line up with the tick
# labels, even if a class happens to be absent from the test split.
cm = confusion_matrix(y_test, y_pred, labels=class_names)

fig = plt.figure(figsize=(16, 14))
ax = plt.subplot()
sns.heatmap(cm, annot=True, ax=ax, fmt='g');  # annot=True writes the counts in the cells

# x axis: predicted class
ax.set_xlabel('Predicted', fontsize=20)
ax.xaxis.set_label_position('bottom')
plt.xticks(rotation=90)
ax.xaxis.set_ticklabels(class_names, fontsize=10)
ax.xaxis.tick_bottom()

# y axis: true class
ax.set_ylabel('True', fontsize=20)
ax.yaxis.set_ticklabels(class_names, fontsize=10)
plt.yticks(rotation=0)

plt.title('Refined Confusion Matrix', fontsize=20)

# Distinct file name so this figure neither overwrites nor is overwritten by
# the other confusion-matrix cells that save to 'ConMat24.png'.
plt.savefig('ConMat24_TFIDF_unigram.png')
plt.show()

In [None]:
# TF-IDF features over unigrams and bigrams (ngram_range=(1, 2)).
# Split the raw text first and fit the vocabulary/IDF weights on the training
# rows only, so no statistics from the test rows leak into training features.
# The row partition is unchanged: it depends only on the number of rows,
# test_size and random_state.
tokens_train, tokens_test, y_train, y_test = train_test_split(
    df['Token'], df['Label'], test_size=0.2, random_state=42
)

# Documents are whitespace-tokenised (str.split with no arguments).
tfidf_bigram = TfidfVectorizer(ngram_range=(1, 2), use_idf=True, tokenizer=str.split)
x_train = tfidf_bigram.fit_transform(tokens_train)
x_test = tfidf_bigram.transform(tokens_test)

# Whole-corpus matrix in the train-fitted feature space, kept under its
# original name for any later cell that reads it. Do not fit models on it.
x_tfidf_bigram = tfidf_bigram.transform(df['Token'])

In [None]:
# Grid-search every candidate model on the TF-IDF bigram training split and
# evaluate the refitted best estimator on the held-out test split.
scores = []
for model_name, mp in model_params.items():
    # A single multi-metric search evaluates all scorers in one pass instead of
    # repeating the whole grid once per scorer. The estimator is refitted on the
    # last listed metric ('accuracy' as configured), so best_params_ and the
    # predictions below are selected by that metric.
    clf = GridSearchCV(
        mp['model'],
        mp['params'],
        cv=5,
        scoring=scores_function,
        refit=scores_function[-1],
    )
    clf.fit(x_train, y_train)

    # Predict once and reuse the result for every test metric.
    test_pred = clf.predict(x_test)
    scores.append({
        'model': model_name,
        'best_params': clf.best_params_,
        'precision': precision_score(y_test, test_pred, average='macro'),
        'recall': recall_score(y_test, test_pred, average='macro'),
        'f1': f1_score(y_test, test_pred, average='macro'),
        'accuracy': accuracy_score(y_test, test_pred),
    })

# One row per model; saved to Excel and displayed best-accuracy-first.
df_score = pd.DataFrame(scores, columns=['model', 'best_params', 'precision', 'recall', 'f1', 'accuracy'])
df_score.to_excel('2 classes TF-IDF Bigram ML parameters.xlsx')
df_score.sort_values(by=['accuracy'], ascending=False)


In [None]:
from sklearn.metrics import confusion_matrix

# NOTE: `clf` is whichever search the preceding training loop fitted last
# (the final entry of model_params), not necessarily the top-ranked model.
# Predictions stay 1-D; the metric functions do not need a column vector.
y_pred = clf.best_estimator_.predict(x_test)

class_names = categories

# Pin the label order so matrix rows/columns always line up with the tick
# labels, even if a class happens to be absent from the test split.
cm = confusion_matrix(y_test, y_pred, labels=class_names)

fig = plt.figure(figsize=(16, 14))
ax = plt.subplot()
sns.heatmap(cm, annot=True, ax=ax, fmt='g');  # annot=True writes the counts in the cells

# x axis: predicted class
ax.set_xlabel('Predicted', fontsize=20)
ax.xaxis.set_label_position('bottom')
plt.xticks(rotation=90)
ax.xaxis.set_ticklabels(class_names, fontsize=10)
ax.xaxis.tick_bottom()

# y axis: true class
ax.set_ylabel('True', fontsize=20)
ax.yaxis.set_ticklabels(class_names, fontsize=10)
plt.yticks(rotation=0)

plt.title('Refined Confusion Matrix', fontsize=20)

# Distinct file name so this figure neither overwrites nor is overwritten by
# the other confusion-matrix cells that save to 'ConMat24.png'.
plt.savefig('ConMat24_TFIDF_bigram.png')
plt.show()

In [None]:
# TF-IDF features over unigrams, bigrams and trigrams (ngram_range=(1, 3)).
# Split the raw text first and fit the vocabulary/IDF weights on the training
# rows only, so no statistics from the test rows leak into training features.
# The row partition is unchanged: it depends only on the number of rows,
# test_size and random_state.
tokens_train, tokens_test, y_train, y_test = train_test_split(
    df['Token'], df['Label'], test_size=0.2, random_state=42
)

# Documents are whitespace-tokenised (str.split with no arguments).
tfidf_trigram = TfidfVectorizer(ngram_range=(1, 3), use_idf=True, tokenizer=str.split)
x_train = tfidf_trigram.fit_transform(tokens_train)
x_test = tfidf_trigram.transform(tokens_test)

# Whole-corpus matrix in the train-fitted feature space, kept under its
# original name for any later cell that reads it. Do not fit models on it.
x_tfidf_trigram = tfidf_trigram.transform(df['Token'])

In [None]:
# Grid-search every candidate model on the TF-IDF trigram training split and
# evaluate the refitted best estimator on the held-out test split.
scores = []
for model_name, mp in model_params.items():
    # A single multi-metric search evaluates all scorers in one pass instead of
    # repeating the whole grid once per scorer. The estimator is refitted on the
    # last listed metric ('accuracy' as configured), so best_params_ and the
    # predictions below are selected by that metric.
    clf = GridSearchCV(
        mp['model'],
        mp['params'],
        cv=5,
        scoring=scores_function,
        refit=scores_function[-1],
    )
    clf.fit(x_train, y_train)

    # Predict once and reuse the result for every test metric.
    test_pred = clf.predict(x_test)
    scores.append({
        'model': model_name,
        'best_params': clf.best_params_,
        'precision': precision_score(y_test, test_pred, average='macro'),
        'recall': recall_score(y_test, test_pred, average='macro'),
        'f1': f1_score(y_test, test_pred, average='macro'),
        'accuracy': accuracy_score(y_test, test_pred),
    })

# One row per model; saved to Excel and displayed best-accuracy-first.
df_score = pd.DataFrame(scores, columns=['model', 'best_params', 'precision', 'recall', 'f1', 'accuracy'])
df_score.to_excel('2 classes TF-IDF Trigram ML parameters.xlsx')
df_score.sort_values(by=['accuracy'], ascending=False)


In [None]:
from sklearn.metrics import confusion_matrix

# NOTE: `clf` is whichever search the preceding training loop fitted last
# (the final entry of model_params), not necessarily the top-ranked model.
# Predictions stay 1-D; the metric functions do not need a column vector.
y_pred = clf.best_estimator_.predict(x_test)

class_names = categories

# Pin the label order so matrix rows/columns always line up with the tick
# labels, even if a class happens to be absent from the test split.
cm = confusion_matrix(y_test, y_pred, labels=class_names)

fig = plt.figure(figsize=(16, 14))
ax = plt.subplot()
sns.heatmap(cm, annot=True, ax=ax, fmt='g');  # annot=True writes the counts in the cells

# x axis: predicted class
ax.set_xlabel('Predicted', fontsize=20)
ax.xaxis.set_label_position('bottom')
plt.xticks(rotation=90)
ax.xaxis.set_ticklabels(class_names, fontsize=10)
ax.xaxis.tick_bottom()

# y axis: true class
ax.set_ylabel('True', fontsize=20)
ax.yaxis.set_ticklabels(class_names, fontsize=10)
plt.yticks(rotation=0)

plt.title('Refined Confusion Matrix', fontsize=20)

plt.savefig('ConMat24.png')
plt.show()