# TF-IDF features with SVM and Random Forest classifiers

In [None]:
# Mount Google Drive into the Colab filesystem so the dataset files are readable.
from google.colab import drive
# force_remount=False: do not force a remount if the drive is already mounted.
drive.mount('/content/drive', force_remount=False)

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

# Import the dataset

In [None]:
def _read_split(xlsx_path):
  """Read one header-less split and return (frame, column 0, column 1).

  Column 0 is used downstream as the text input and column 1 as the label.
  """
  frame = pd.read_excel(xlsx_path, header=None)
  return frame, frame[0], frame[1]


# NOTE: the paths are relative, so they resolve against the current working
# directory (presumably /content on Colab, where the drive is mounted).
emotion_dataset_train, x_train, y_train = _read_split(
    'drive/MyDrive/ShortPersianEmo 2023 Code/Data/ShortPersianEmo/train_fa.xlsx')

emotion_dataset_test, x_test, y_test = _read_split(
    'drive/MyDrive/ShortPersianEmo 2023 Code/Data/ShortPersianEmo/test_fa.xlsx')

In [None]:
# Sanity check: number of training samples (one entry per text).
x_train.shape

(4924,)

In [None]:
# Sanity check: number of test samples (one entry per text).
x_test.shape

(548,)

In [None]:
from sklearn import preprocessing

# Encode the string emotion labels as integer class ids.
#
# The encoder is fitted on the union of train and test labels so both splits
# share one mapping and a label seen only in the test split cannot raise.
#
# Labels are read from the untouched DataFrames rather than from y_train /
# y_test, which this cell overwrites. That keeps the cell idempotent:
# re-running it no longer refits the encoder on already-encoded integers
# (which would silently replace the emotion names in `le.classes_`).
le = preprocessing.LabelEncoder()
le.fit(pd.concat([emotion_dataset_train[1], emotion_dataset_test[1]]))
y_train = le.transform(emotion_dataset_train[1])
y_test = le.transform(emotion_dataset_test[1])

In [None]:
# Show which integer id the encoder assigned to each emotion label.
encoded_ids = le.transform(le.classes_)
le_name_mapping = {label: code for label, code in zip(le.classes_, encoded_ids)}
print(le_name_mapping)

{'ANGRY': 0, 'FEAR': 1, 'HAPPY': 2, 'OTHER': 3, 'SAD': 4}


# Extract TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary and IDF weights from the training texts only, then
# project both splits onto that vocabulary.
# (CountVectorizer() can be swapped in here to use raw term counts instead.)
vectorizer = TfidfVectorizer().fit(x_train)

X_train, X_test = vectorizer.transform(x_train), vectorizer.transform(x_test)

# Kept for later: maps feature columns back to their tokens.
feature_names = vectorizer.get_feature_names_out()

print(feature_names)
print(X_train.shape)

['00' '10' '100' ... '۹۸' '۹۹' 'ᴇxᴏᴇɴᴛʏ']
(4924, 10385)


# Scale Data

In [None]:
from sklearn.preprocessing import StandardScaler

# Scale each feature to unit variance using statistics from the training split.
# with_mean=False skips centering -- presumably because the TF-IDF matrix is
# sparse and centering would densify it (TODO confirm).
scaler = StandardScaler(with_mean=False).fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Sanity check: shape of the training matrix after scaling.
X_train.shape

(4924, 10385)

# Feature Selection

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Keep the 300 features with the highest ANOVA F-score against the labels.
# The selector is fitted on the training split only and then applied to both.
selector = SelectKBest(f_classif, k=300).fit(X_train, y_train)

X_train, X_test = selector.transform(X_train), selector.transform(X_test)


In [None]:
# Boolean mask over the original vocabulary marking the retained features.
selected_mask = selector.get_support()
print(feature_names[selected_mask])

['hero' 'آذربايجان' 'آلارم' 'آن' 'اتاق' 'احتمال' 'ارزش' 'اروپا' 'از'
 'ازدحام' 'ازش' 'استانبول' 'استرس' 'استفاده' 'اشک' 'اشکم' 'اصلا' 'اطلاع'
 'اعتراضات_سراسری_تلاویو' 'اعتصابات_سراسری' 'افتضاح' 'افغانستان' 'العاده'
 'ام' 'اما' 'امشب' 'امیدوارم' 'انگار' 'انگیز' 'او' 'اومد' 'اومده' 'اونجا'
 'اوکراین' 'اینترنت' 'باخت' 'بالایی' 'باید' 'بخرید' 'بخرین' 'بخواهد' 'بد'
 'بدرد' 'بده' 'برای' 'برای_سرباز' 'بزنید' 'بسیار' 'بشه' 'بعد' 'بغض' 'بم'
 'بمیرم' 'بمیریم' 'بهترین' 'بورس' 'بوی' 'بگو' 'بی' 'بیاد' 'بیفتم' 'تا'
 'تجمع' 'ترس' 'ترسم' 'ترسناک' 'تروریست' 'ترکیه' 'تنها' 'تنهای' 'تنهایی'
 'تهران' 'تو' 'تپش' 'جاش' 'جان' 'جانفدا' 'جز' 'جن' 'جنس' 'جونم' 'حاج'
 'حافظ' 'حال' 'حتما' 'حتی' 'حرف' 'حس' 'حسرت' 'خبر' 'خراب' 'خرید' 'خریدش'
 'خریدم' 'خسارت' 'خسته' 'خواب' 'خوب' 'خوبه' 'خوبی' 'خوبیه' 'خودت' 'خوش'
 'خوی' 'خیال' 'خیلی' 'دارم' 'داره' 'داستان' 'داعش' 'در' 'درد' 'دقایقی'
 'دل' 'دلتنگی' 'دلم' 'دنده' 'دنیای' 'دور' 'دوست' 'دی' 'دیجی' 'دیگه' 'را'
 'راحت' 'راضی' 'راضیم' 'رد' 'رسید' 'رنگ' 'رو' 'روسیه' 'ریخت' 

In [None]:
# Sanity check: the column count should now equal the selector's k.
X_train.shape

(4924, 300)

# SVM

In [None]:
%%timeit

import sklearn.metrics as skm
from sklearn.svm import SVC
from sklearn.metrics import classification_report

import math
def ceiltoup(x):
  """Round ``x`` UP to two decimal places.

  Note that this is a ceiling, not round-to-nearest: 0.5407 becomes 0.55.
  Metrics reported through it are therefore upper bounds.

  ``x * 100`` is rounded to 9 decimals before the ceiling so that binary
  floating-point error does not push an exact two-decimal value up a step
  (e.g. 0.07 * 100 == 7.000000000000001, which used to give 0.08).
  """
  return math.ceil(round(x * 100, 9)) / 100.0

# Train and evaluate the SVM once.
#
# SVC only uses randomness for probability estimates (probability=True), which
# is not enabled here, so every fit on the same data yields the same model.
# Repeating the fit 10 times produced 10 identical scores at 10x the cost.
# The scores are still kept in lists so the reporting below is unchanged.
svc_model = SVC(gamma='auto')
svc_model.fit(X_train, y_train)
y_pred = svc_model.predict(X_test)

f1_macro = [skm.f1_score(y_test, y_pred, average="macro")]
accuracy = [skm.accuracy_score(y_test, y_pred, normalize=True)]

print('f1 macro : ' + str(f1_macro) + '\nmean f1 macro : ' + str(ceiltoup(np.mean(f1_macro))))
print('accuracy : ' + str(accuracy) + '\nmean accuracy : ' + str(ceiltoup(np.mean(accuracy))))

f1 macro : [0.5788911458284811, 0.5788911458284811, 0.5788911458284811, 0.5788911458284811, 0.5788911458284811, 0.5788911458284811, 0.5788911458284811, 0.5788911458284811, 0.5788911458284811, 0.5788911458284811]
mean f1 macro : 0.58
accuracy : [0.6259124087591241, 0.6259124087591241, 0.6259124087591241, 0.6259124087591241, 0.6259124087591241, 0.6259124087591241, 0.6259124087591241, 0.6259124087591241, 0.6259124087591241, 0.6259124087591241]
mean accuracy : 0.63


# Random Forest

In [None]:
%%timeit

import sklearn.metrics as skm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

import math
def ceiltoup(x):
  """Round ``x`` UP to two decimal places.

  Note that this is a ceiling, not round-to-nearest: 0.5407 becomes 0.55.
  Metrics reported through it are therefore upper bounds.

  ``x * 100`` is rounded to 9 decimals before the ceiling so that binary
  floating-point error does not push an exact two-decimal value up a step
  (e.g. 0.07 * 100 == 7.000000000000001, which used to give 0.08).
  """
  return math.ceil(round(x * 100, 9)) / 100.0

# Train and evaluate 10 random forests and report the per-run and mean scores.
#
# Each run is seeded with its loop index: the 10 forests still differ from one
# another (so the mean averages over the model's randomness), but the whole
# experiment now gives the same numbers on every re-run.
f1_macro = []
accuracy = []
for seed in range(10):
  forest_model = RandomForestClassifier(random_state=seed)
  forest_model.fit(X_train, y_train)
  y_pred = forest_model.predict(X_test)
  f1_macro.append(skm.f1_score(y_test, y_pred, average="macro"))
  accuracy.append(skm.accuracy_score(y_test, y_pred, normalize=True))
print('f1 macro : ' + str(f1_macro) + '\nmean f1 macro : ' + str(ceiltoup(np.mean(f1_macro))))
print('accuracy : ' + str(accuracy) + '\nmean accuracy : ' + str(ceiltoup(np.mean(accuracy))))

f1 macro : [0.5475562515181308, 0.5430844089280011, 0.5521008950989976, 0.5494913460364967, 0.54221211112464, 0.5386218056378912, 0.5383254952348111, 0.5286964524540718, 0.5338842050762199, 0.5332865961679107]
mean f1 macro : 0.55
accuracy : [0.5985401459854015, 0.5857664233576643, 0.5967153284671532, 0.5875912408759124, 0.5821167883211679, 0.5784671532846716, 0.583941605839416, 0.5821167883211679, 0.5894160583941606, 0.572992700729927]
mean accuracy : 0.59
