# TFIDF - SVM

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

# Import the dataset

In [3]:
emotion_dataset_train = pd.read_excel('drive/MyDrive/ShortPersianEmo 2023 Code/Data/ShortPersianEmo/train_fa.xlsx', header=None)
x_train = emotion_dataset_train[0]
y_train = emotion_dataset_train[1]

emotion_dataset_test = pd.read_excel('drive/MyDrive/ShortPersianEmo 2023 Code/Data/ShortPersianEmo/test_fa.xlsx', header=None)
x_test = emotion_dataset_test[0]
y_test = emotion_dataset_test[1]

In [4]:
x_train.shape

(4924,)

In [5]:
x_test.shape

(548,)

In [6]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(pd.concat([y_train, y_test]))
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [7]:
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(le_name_mapping)

{'ANGRY': 0, 'FEAR': 1, 'HAPPY': 2, 'OTHER': 3, 'SAD': 4}


# Extract TFIDF

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = TfidfVectorizer()
# vectorizer = CountVectorizer()

vectorizer.fit(x_train)

X_train = vectorizer.transform(x_train)
X_test = vectorizer.transform(x_test)

feature_names = vectorizer.get_feature_names_out()

print(vectorizer.get_feature_names_out())

print(X_train.shape)

['00' '10' '100' ... '۹۸' '۹۹' 'ᴇxᴏᴇɴᴛʏ']
(4924, 10385)


# Scale Data

In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler(with_mean=False)

scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
X_train.shape

(4924, 10385)

# Feature Selection

In [11]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

selector = SelectKBest(f_classif, k=300)
selector.fit(X_train, y_train)
X_train = selector.transform(X_train)
X_test = selector.transform(X_test)


In [12]:
print(feature_names[selector.get_support()])

['hero' 'آذربايجان' 'آلارم' 'آن' 'اتاق' 'احتمال' 'ارزش' 'اروپا' 'از'
 'ازدحام' 'ازش' 'استانبول' 'استرس' 'استفاده' 'اشک' 'اشکم' 'اصلا' 'اطلاع'
 'اعتراضات_سراسری_تلاویو' 'اعتصابات_سراسری' 'افتضاح' 'افغانستان' 'العاده'
 'ام' 'اما' 'امشب' 'امیدوارم' 'انگار' 'انگیز' 'او' 'اومد' 'اومده' 'اونجا'
 'اوکراین' 'اینترنت' 'باخت' 'بالایی' 'باید' 'بخرید' 'بخرین' 'بخواهد' 'بد'
 'بدرد' 'بده' 'برای' 'برای_سرباز' 'بزنید' 'بسیار' 'بشه' 'بعد' 'بغض' 'بم'
 'بمیرم' 'بمیریم' 'بهترین' 'بورس' 'بوی' 'بگو' 'بی' 'بیاد' 'بیفتم' 'تا'
 'تجمع' 'ترس' 'ترسم' 'ترسناک' 'تروریست' 'ترکیه' 'تنها' 'تنهای' 'تنهایی'
 'تهران' 'تو' 'تپش' 'جاش' 'جان' 'جانفدا' 'جز' 'جن' 'جنس' 'جونم' 'حاج'
 'حافظ' 'حال' 'حتما' 'حتی' 'حرف' 'حس' 'حسرت' 'خبر' 'خراب' 'خرید' 'خریدش'
 'خریدم' 'خسارت' 'خسته' 'خواب' 'خوب' 'خوبه' 'خوبی' 'خوبیه' 'خودت' 'خوش'
 'خوی' 'خیال' 'خیلی' 'دارم' 'داره' 'داستان' 'داعش' 'در' 'درد' 'دقایقی'
 'دل' 'دلتنگی' 'دلم' 'دنده' 'دنیای' 'دور' 'دوست' 'دی' 'دیجی' 'دیگه' 'را'
 'راحت' 'راضی' 'راضیم' 'رد' 'رسید' 'رنگ' 'رو' 'روسیه' 'ریخت' 

In [13]:
X_train.shape

(4924, 300)

# SVM

In [14]:
%%timeit

import sklearn.metrics as skm
from sklearn.svm import SVC
from sklearn.metrics import classification_report

import math
def ceiltoup(x):
  return math.ceil(x * 100) / 100.0

f1_macro = []
accuracy = []
svc_model = SVC(gamma='auto')
svc_model.fit(X_train,y_train)
y_pred = svc_model.predict(X_test)
f1_macro.append(skm.f1_score(y_test, y_pred, average="macro"))
accuracy.append(skm.accuracy_score(y_test, y_pred, normalize=True))
print('f1 macro : ' + str(f1_macro) + '\nmean f1 macro : ' + str(ceiltoup(np.mean(f1_macro))))
print('accuracy : ' + str(accuracy) + '\nmean accuracy : ' + str(ceiltoup(np.mean(accuracy))))

f1 macro : [0.5788911458284811]
mean f1 macro : 0.58
accuracy : [0.6259124087591241]
mean accuracy : 0.63
f1 macro : [0.5788911458284811]
mean f1 macro : 0.58
accuracy : [0.6259124087591241]
mean accuracy : 0.63
f1 macro : [0.5788911458284811]
mean f1 macro : 0.58
accuracy : [0.6259124087591241]
mean accuracy : 0.63
f1 macro : [0.5788911458284811]
mean f1 macro : 0.58
accuracy : [0.6259124087591241]
mean accuracy : 0.63
f1 macro : [0.5788911458284811]
mean f1 macro : 0.58
accuracy : [0.6259124087591241]
mean accuracy : 0.63
f1 macro : [0.5788911458284811]
mean f1 macro : 0.58
accuracy : [0.6259124087591241]
mean accuracy : 0.63
f1 macro : [0.5788911458284811]
mean f1 macro : 0.58
accuracy : [0.6259124087591241]
mean accuracy : 0.63
f1 macro : [0.5788911458284811]
mean f1 macro : 0.58
accuracy : [0.6259124087591241]
mean accuracy : 0.63
2.07 s ± 271 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Random Forest

In [15]:
%%timeit

import sklearn.metrics as skm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

import math
def ceiltoup(x):
  return math.ceil(x * 100) / 100.0

f1_macro = []
accuracy = []
forest_model = RandomForestClassifier()
forest_model.fit(X_train,y_train)
y_pred = forest_model.predict(X_test)
f1_macro.append(skm.f1_score(y_test, y_pred, average="macro"))
accuracy.append(skm.accuracy_score(y_test, y_pred, normalize=True))
print('f1 macro : ' + str(f1_macro) + '\nmean f1 macro : ' + str(ceiltoup(np.mean(f1_macro))))
print('accuracy : ' + str(accuracy) + '\nmean accuracy : ' + str(ceiltoup(np.mean(accuracy))))

f1 macro : [0.5461390049716105]
mean f1 macro : 0.55
accuracy : [0.5912408759124088]
mean accuracy : 0.6
f1 macro : [0.5587012987012987]
mean f1 macro : 0.56
accuracy : [0.6003649635036497]
mean accuracy : 0.61
f1 macro : [0.5437364466488765]
mean f1 macro : 0.55
accuracy : [0.583941605839416]
mean accuracy : 0.59
f1 macro : [0.5410168752049036]
mean f1 macro : 0.55
accuracy : [0.5857664233576643]
mean accuracy : 0.59
f1 macro : [0.5555494424562188]
mean f1 macro : 0.56
accuracy : [0.5912408759124088]
mean accuracy : 0.6
f1 macro : [0.5456913318236843]
mean f1 macro : 0.55
accuracy : [0.5894160583941606]
mean accuracy : 0.59
f1 macro : [0.5349403252265724]
mean f1 macro : 0.54
accuracy : [0.5784671532846716]
mean accuracy : 0.58
f1 macro : [0.5540079200850144]
mean f1 macro : 0.56
accuracy : [0.593065693430657]
mean accuracy : 0.6
3.09 s ± 748 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
