# TFIDF - SVM

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

# Import the dataset

In [3]:
emotion_dataset_train = pd.read_excel('drive/MyDrive/ShortPersianEmo 2023 Code/Data/ArmanEmo/train_fa.xlsx', header=None).dropna()
x_train = emotion_dataset_train[0]
y_train = emotion_dataset_train[1]

emotion_dataset_test = pd.read_excel('drive/MyDrive/ShortPersianEmo 2023 Code/Data/ArmanEmo/test_fa.xlsx', header=None).dropna()
x_test = emotion_dataset_test[0]
y_test = emotion_dataset_test[1]

In [4]:
x_train.shape

(6150,)

In [5]:
x_test.shape

(1151,)

In [6]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(pd.concat([y_train, y_test]))
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [7]:
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(le_name_mapping)

{'ANGRY': 0, 'FEAR': 1, 'HAPPY': 2, 'HATE': 3, 'OTHER': 4, 'SAD': 5, 'SURPRISE': 6}


# Extract TFIDF

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = TfidfVectorizer()
# vectorizer = CountVectorizer()

vectorizer.fit(x_train)

X_train = vectorizer.transform(x_train)
X_test = vectorizer.transform(x_test)

feature_names = vectorizer.get_feature_names_out()

print(vectorizer.get_feature_names_out())

print(X_train.shape)

['000' '09015721193' '09198285759' ... 'ﻧﺎﺑﯿﻨﺎﯾﯽ' 'ﻧﻤﯿﮑﺸﻪ' 'ﻫﯿﮑﻠﺶ']
(6150, 22659)


# Scale Data

In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler(with_mean=False)

scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
X_train.shape

(6150, 22659)

# Feature Selection

In [11]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

selector = SelectKBest(f_classif, k=300)
selector.fit(X_train, y_train)
X_train = selector.transform(X_train)
X_test = selector.transform(X_test)


In [12]:
print(feature_names[selector.get_support()])

['آخه' 'آدمی' 'آمپول' 'آن' 'آنها' 'احساس' 'اخه' 'ادب' 'ارتفاع' 'اره' 'از'
 'ازت' 'ازدواج' 'ازش' 'ازشون' 'ازین' 'است' 'اعتراض' 'اعصابم' 'اعلام'
 'افتاد' 'اقتصادی' 'اما' 'امید' 'امیدوارم' 'انتخاب' 'انتقاد' 'انجام'
 'انقد' 'انقلاب' 'انگار' 'انگیز' 'اه' 'او' 'اگر' 'ایران' 'این' 'اینطوری'
 'اینقدر' 'اینکه' 'با' 'باشد' 'باشی' 'باشید' 'باید' 'بخدا' 'بخند' 'بخیر'
 'بدبختی' 'بدش' 'بدم' 'بدمون' 'بدنم' 'بر' 'برای' 'بزنم' 'بشم' 'بغض' 'به'
 'بهم' 'بودم' 'بی' 'بیزارم' 'بیشتر' 'تا' 'تاثیری' 'تبریک' 'تجاوز' 'تخمیه'
 'ترس' 'ترسم' 'ترسناك' 'ترسناک' 'ترسناکه' 'ترسناکی' 'ترسو' 'ترسیدم'
 'ترسیده' 'ترسیدی' 'ترسیدیم' 'تعجب' 'تف' 'تلاش' 'تنفر' 'تنفرم' 'تنگ' 'تو'
 'تولدت' 'جالب' 'جالبه' 'جرات' 'جنازه' 'حال' 'حالم' 'حجاب' 'حروم' 'حس'
 'حقوق' 'حمایت' 'خاطره' 'خانواده' 'خبر' 'خدایا' 'خر' 'خرد' 'خریدم' 'خشم'
 'خشونت' 'خنده' 'خندون' 'خندیدم' 'خوب' 'خوبی' 'خودم' 'خورد' 'خورده' 'خوش'
 'خوشحال' 'خونه' 'خیس' 'خیلی' 'داده' 'دارد' 'دارم' 'داغون' 'در' 'درد'
 'دزدی' 'دشمن' 'دل' 'دلبر' 'دلت' 'دلم' 'دلها' 'دنیا' 'دهن' 'دوست'

In [13]:
X_train.shape

(6150, 300)

# SVM

In [17]:
%%timeit

import sklearn.metrics as skm
from sklearn.svm import SVC
from sklearn.metrics import classification_report

import math
def ceiltoup(x):
  return math.ceil(x * 100) / 100.0

f1_macro = []
accuracy = []
svc_model = SVC(gamma='auto')
svc_model.fit(X_train,y_train)
y_pred = svc_model.predict(X_test)
f1_macro.append(skm.f1_score(y_test, y_pred, average="macro"))
accuracy.append(skm.accuracy_score(y_test, y_pred, normalize=True))
print('f1 macro : ' + str(f1_macro) + '\nmean f1 macro : ' + str(ceiltoup(np.mean(f1_macro))))
print('accuracy : ' + str(accuracy) + '\nmean accuracy : ' + str(ceiltoup(np.mean(accuracy))))

f1 macro : [0.408037786810857]
mean f1 macro : 0.41
accuracy : [0.40747176368375326]
mean accuracy : 0.41
f1 macro : [0.408037786810857]
mean f1 macro : 0.41
accuracy : [0.40747176368375326]
mean accuracy : 0.41
f1 macro : [0.408037786810857]
mean f1 macro : 0.41
accuracy : [0.40747176368375326]
mean accuracy : 0.41
f1 macro : [0.408037786810857]
mean f1 macro : 0.41
accuracy : [0.40747176368375326]
mean accuracy : 0.41
f1 macro : [0.408037786810857]
mean f1 macro : 0.41
accuracy : [0.40747176368375326]
mean accuracy : 0.41
f1 macro : [0.408037786810857]
mean f1 macro : 0.41
accuracy : [0.40747176368375326]
mean accuracy : 0.41
f1 macro : [0.408037786810857]
mean f1 macro : 0.41
accuracy : [0.40747176368375326]
mean accuracy : 0.41
f1 macro : [0.408037786810857]
mean f1 macro : 0.41
accuracy : [0.40747176368375326]
mean accuracy : 0.41
4.76 s ± 527 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Random Forest

In [16]:
%%timeit

import sklearn.metrics as skm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

import math
def ceiltoup(x):
  return math.ceil(x * 100) / 100.0

f1_macro = []
accuracy = []

forest_model = RandomForestClassifier()
forest_model.fit(X_train,y_train)
y_pred = forest_model.predict(X_test)
f1_macro.append(skm.f1_score(y_test, y_pred, average="macro"))
accuracy.append(skm.accuracy_score(y_test, y_pred, normalize=True))
print('f1 macro : ' + str(f1_macro) + '\nmean f1 macro : ' + str(ceiltoup(np.mean(f1_macro))))
print('accuracy : ' + str(accuracy) + '\nmean accuracy : ' + str(ceiltoup(np.mean(accuracy))))

f1 macro : [0.3591944606429749]
mean f1 macro : 0.36
accuracy : [0.35881841876629017]
mean accuracy : 0.36
f1 macro : [0.3506335143803002]
mean f1 macro : 0.36
accuracy : [0.35881841876629017]
mean accuracy : 0.36
f1 macro : [0.3684566402912286]
mean f1 macro : 0.37
accuracy : [0.3640312771503041]
mean accuracy : 0.37
f1 macro : [0.3661304956647096]
mean f1 macro : 0.37
accuracy : [0.366637706342311]
mean accuracy : 0.37
f1 macro : [0.374324550971997]
mean f1 macro : 0.38
accuracy : [0.368375325803649]
mean accuracy : 0.37
f1 macro : [0.37233339906444163]
mean f1 macro : 0.38
accuracy : [0.369244135534318]
mean accuracy : 0.37
f1 macro : [0.3589217564591873]
mean f1 macro : 0.36
accuracy : [0.35881841876629017]
mean accuracy : 0.36
f1 macro : [0.37135287368252173]
mean f1 macro : 0.38
accuracy : [0.37011294526498695]
mean accuracy : 0.38
5.43 s ± 1.92 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
