# Persian Stance Classification - Shallow Learning

In [None]:
# Location of the cleaned claim/body CSV on the mounted Google Drive
cleaned_path = (
    "/content/drive/MyDrive/Stance Detection Project/"
    "dataset cleaned/Clean_Claim_Body.csv"
)

In [None]:
# Number of top-scoring features kept by SelectKBest in the feature-selection step
num_features: int = 300

## Import Packages

In [None]:
# Import required packages
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## Mount Google Drive

In [None]:
# Attach Google Drive so files under /content/drive are readable (Colab only)
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Read Cleaned Data from CSV File

In [None]:
# Load the cleaned dataset; the first CSV column becomes the row index
dataset_clean = pd.read_csv(cleaned_path, index_col=0)

In [None]:
# Preview the first five rows of the loaded frame
dataset_clean.head(n=5)

Unnamed: 0,claim,body,label
0,کلاهبرداری از رانندگان با شگرد نشت بنزین !,به گزارش خبرنگار گروه جامعه خبرگزاری میزان،29 ...,Discuss
1,تجاوز به دختر بازداشت شده و واژگونی ون گشت ارش...,انتشار کلیپ واژگونی ماشین گشت ارشاد توسط مردم ...,Discuss
2,تعظیم 20 دقیقه ای وزیر نیرو ژاپن به علت قطع بر...,وزیر نیروی ژاپن به علت قطع شدن برق؛ به همان مد...,Agree
3,سرمربیگری گاس هیدینک برای تراکتورسازی,به تازگی محمد تقوی استعفای خود را از سرمربیگری...,Discuss
4,کشف موجود عجیبی شبیه انسان در یک حفاری در پاکس...,پس از 20 سال حفاری با دقتی باورنکردنی، سرانجام...,Unrelated


## Build Input Text and Labels

In [None]:
# Join each claim with its article body into a single text per sample,
# separated by " . ".
dataset_clean['concatenated'] = (
    dataset_clean['claim'] + " . " + dataset_clean['body']
)

# Model inputs (joined text) and targets (stance label strings).
# NOTE(review): despite the *_BERT names, the text is vectorized with TF-IDF
# later in this notebook; no BERT model is visible here.
X_BERT, y_BERT = dataset_clean['concatenated'], dataset_clean['label']

In [None]:
# Preview the first five joined claim/body texts
X_BERT.head(n=5)

0    کلاهبرداری از رانندگان با شگرد نشت بنزین ! . ب...
1    تجاوز به دختر بازداشت شده و واژگونی ون گشت ارش...
2    تعظیم 20 دقیقه ای وزیر نیرو ژاپن به علت قطع بر...
3    سرمربیگری گاس هیدینک برای تراکتورسازی . به تاز...
4    کشف موجود عجیبی شبیه انسان در یک حفاری در پاکس...
Name: concatenated, dtype: object

In [None]:
# Preview the first five stance labels
y_BERT.head(n=5)

0      Discuss
1      Discuss
2        Agree
3      Discuss
4    Unrelated
Name: label, dtype: object

In [None]:
# Stance label <-> integer id lookup tables
label2id = {'Agree': 0, 'Disagree': 1, 'Discuss': 2, 'Unrelated': 3}
id2label = {idx: name for name, idx in label2id.items()}

# Encode from the original string column rather than from y_BERT itself, so
# re-running this cell does not map already-encoded ids to NaN.
y_BERT = dataset_clean['label'].map(label2id)

# Series.map yields NaN for any label missing from label2id; fail loudly
# instead of handing NaN targets to the models.
if y_BERT.isna().any():
    unmapped = dataset_clean.loc[y_BERT.isna(), 'label'].unique().tolist()
    raise ValueError(f"Labels not covered by label2id: {unmapped}")

y_BERT.head()

0    2
1    2
2    0
3    2
4    3
Name: label, dtype: int64

## Vectorization

In [None]:
%%time

from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): the vectorizer is fitted on the whole corpus, i.e. before the
# train/test split further down, so test documents contribute to the
# vocabulary and IDF weights. Consider fitting on the training split only.
vectorizer = TfidfVectorizer()
x_bert = vectorizer.fit_transform(X_BERT)

# Report the size of the sparse TF-IDF matrix (documents x vocabulary terms).
# It is deliberately not densified with .toarray(): that allocates the full
# matrix only to print a truncated preview.
print(x_bert.shape)

# Alias for the integer-encoded labels, used by the train/test split
y_bert = y_BERT

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
CPU times: user 1.44 s, sys: 499 ms, total: 1.94 s
Wall time: 2.6 s


## Feature Selection

In [None]:
%%time

from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest

# Keep the `num_features` features with the highest f_classif (ANOVA F) scores.
# NOTE(review): the selector is fitted on all samples and labels, before the
# train/test split, so test labels influence which features are kept.
selector = SelectKBest(score_func = f_classif, k = num_features)

# Fit once and reuse the fitted selector for the transform; the per-feature
# scores stay available as `fit.scores_`.
fit = selector.fit(x_bert, y_BERT)
x_bert = fit.transform(x_bert)

CPU times: user 135 ms, sys: 8.31 ms, total: 143 ms
Wall time: 279 ms


## Train and Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified — consider stratify=y_bert, given the
# uneven class supports shown in the classification reports.
x_train, x_test, y_train, y_test = train_test_split(
    x_bert,
    y_bert,
    test_size=0.2,
    random_state=42,
)

## Support Vector Machines (SVM)

In [None]:
%%time

from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Support vector classifier with scikit-learn's default kernel settings
model = SVC(decision_function_shape='ovo')
model.fit(x_train, y_train)

# Accuracy on the data the model was trained on
# (classifier .score() is the accuracy of predict(X) against y)
accuracy_train = model.score(x_train, y_train)
print(accuracy_train)

# Accuracy on the held-out split; y_pred is reused by the report cell
y_pred = model.predict(x_test)
accuracy_test = accuracy_score(y_test, y_pred)
print(accuracy_test)


0.7626800250469631
0.6475
CPU times: user 969 ms, sys: 4.63 ms, total: 974 ms
Wall time: 1.33 s


In [None]:
print('SVM \n\n')

# Score the integer-encoded labels directly and let classification_report do
# the naming: `labels` pins the rows to all four classes (even if one is
# absent from the test split) and `target_names` supplies their display names.
# zero_division=0 reports 0.00 for a class that is never predicted without
# emitting UndefinedMetricWarning.
print(classification_report(
    y_test,
    y_pred,
    labels=list(id2label.keys()),
    target_names=list(id2label.values()),
    zero_division=0,
))

SVM 


              precision    recall  f1-score   support

       Agree       1.00      0.19      0.31        27
    Disagree       0.82      0.67      0.74        42
     Discuss       0.63      0.81      0.71       195
   Unrelated       0.62      0.51      0.56       136

    accuracy                           0.65       400
   macro avg       0.77      0.54      0.58       400
weighted avg       0.67      0.65      0.63       400



## Random Forest

In [None]:
%%time

from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so repeated runs give the same trees and the same scores;
# leaving random_state unset is consistent with the Random Forest numbers
# varying between the logged runs at the end of this notebook.
model = RandomForestClassifier(n_estimators = 50, max_depth=10, random_state=42)

model.fit(x_train,y_train)

# Accuracy on the training split
y_pred = model.predict(x_train)
accuracy_train = accuracy_score(y_train, y_pred)
print(accuracy_train)

# Accuracy on the held-out split; y_pred is reused by the report cell
y_pred = model.predict(x_test)
accuracy_test = accuracy_score(y_test, y_pred)
print(accuracy_test)


0.7194740137758296
0.5725
CPU times: user 310 ms, sys: 2.99 ms, total: 313 ms
Wall time: 366 ms


In [None]:
print('Random Forest \n\n')

# Score the integer-encoded labels directly and let classification_report do
# the naming: `labels` pins the rows to all four classes (even if one is
# absent from the test split) and `target_names` supplies their display names.
# zero_division=0 reports 0.00 for a class that is never predicted (here the
# model predicts no 'Agree' samples) without emitting UndefinedMetricWarning.
print(classification_report(
    y_test,
    y_pred,
    labels=list(id2label.keys()),
    target_names=list(id2label.values()),
    zero_division=0,
))

Random Forest 


              precision    recall  f1-score   support

       Agree       0.00      0.00      0.00        27
    Disagree       1.00      0.43      0.60        42
     Discuss       0.55      0.91      0.68       195
   Unrelated       0.59      0.25      0.35       136

    accuracy                           0.57       400
   macro avg       0.53      0.40      0.41       400
weighted avg       0.57      0.57      0.51       400



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## KNN

In [None]:
%%time

from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 5 closest training samples
model = KNeighborsClassifier(n_neighbors = 5)
model.fit(x_train, y_train)

# Accuracy on the data the model was trained on
# (classifier .score() is the accuracy of predict(X) against y)
accuracy_train = model.score(x_train, y_train)
print(accuracy_train)

# Accuracy on the held-out split; y_pred is reused by the report cell
y_pred = model.predict(x_test)
accuracy_test = accuracy_score(y_test, y_pred)
print(accuracy_test)


0.7169693174702567
0.5375
CPU times: user 1.54 s, sys: 18.7 ms, total: 1.56 s
Wall time: 2.01 s


In [None]:
print('KNN  \n\n')

# Score the integer-encoded labels directly and let classification_report do
# the naming: `labels` pins the rows to all four classes (even if one is
# absent from the test split) and `target_names` supplies their display names.
# zero_division=0 reports 0.00 for a class that is never predicted without
# emitting UndefinedMetricWarning.
print(classification_report(
    y_test,
    y_pred,
    labels=list(id2label.keys()),
    target_names=list(id2label.values()),
    zero_division=0,
))

KNN  


              precision    recall  f1-score   support

       Agree       0.50      0.11      0.18        27
    Disagree       0.72      0.31      0.43        42
     Discuss       0.58      0.67      0.62       195
   Unrelated       0.45      0.50      0.48       136

    accuracy                           0.54       400
   macro avg       0.56      0.40      0.43       400
weighted avg       0.55      0.54      0.52       400



## Logistic Regression

In [None]:
%%time

from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# `multi_class` is intentionally not passed: the parameter is deprecated in
# recent scikit-learn releases (passing it emits a FutureWarning), and with the
# lbfgs solver a target with more than two classes is already fitted as a
# multinomial model by default.
model = LogisticRegression(solver='lbfgs')

model.fit(x_train,y_train)

# Accuracy on the training split
y_pred = model.predict(x_train)
accuracy_train = accuracy_score(y_train, y_pred)
print(accuracy_train)

# Accuracy on the held-out split; y_pred is reused by the report cell
y_pred = model.predict(x_test)
accuracy_test = accuracy_score(y_test, y_pred)
print(accuracy_test)


0.5723231058234189
0.51
CPU times: user 95.5 ms, sys: 0 ns, total: 95.5 ms
Wall time: 244 ms


In [None]:
print('Multinomial LogisticRegression  \n\n')

# Score the integer-encoded labels directly and let classification_report do
# the naming: `labels` pins the rows to all four classes (even if one is
# absent from the test split) and `target_names` supplies their display names.
# zero_division=0 reports 0.00 for a class that is never predicted (here the
# model predicts no 'Agree' samples) without emitting UndefinedMetricWarning.
print(classification_report(
    y_test,
    y_pred,
    labels=list(id2label.keys()),
    target_names=list(id2label.values()),
    zero_division=0,
))

Multinomial LogisticRegression  


              precision    recall  f1-score   support

       Agree       0.00      0.00      0.00        27
    Disagree       1.00      0.17      0.29        42
     Discuss       0.50      1.00      0.67       195
   Unrelated       0.67      0.01      0.03       136

    accuracy                           0.51       400
   macro avg       0.54      0.30      0.25       400
weighted avg       0.58      0.51      0.36       400



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Results and Log

In [None]:
# Run #1: classification reports recorded for each model on the test split

# SVM


#               precision    recall  f1-score   support

#        Agree       1.00      0.19      0.31        27
#     Disagree       0.82      0.67      0.74        42
#      Discuss       0.63      0.81      0.71       195
#    Unrelated       0.62      0.51      0.56       136

#     accuracy                           0.65       400
#    macro avg       0.77      0.54      0.58       400
# weighted avg       0.67      0.65      0.63       400


# Random Forest


#               precision    recall  f1-score   support

#        Agree       0.00      0.00      0.00        27
#     Disagree       1.00      0.33      0.50        42
#      Discuss       0.52      0.92      0.67       195
#    Unrelated       0.51      0.15      0.24       136

#     accuracy                           0.54       400
#    macro avg       0.51      0.35      0.35       400
# weighted avg       0.53      0.54      0.46       400


# KNN


#               precision    recall  f1-score   support

#        Agree       0.50      0.11      0.18        27
#     Disagree       0.72      0.31      0.43        42
#      Discuss       0.58      0.67      0.62       195
#    Unrelated       0.45      0.50      0.48       136

#     accuracy                           0.54       400
#    macro avg       0.56      0.40      0.43       400
# weighted avg       0.55      0.54      0.52       400


# Multinomial LogisticRegression


#               precision    recall  f1-score   support

#        Agree       0.00      0.00      0.00        27
#     Disagree       1.00      0.17      0.29        42
#      Discuss       0.50      1.00      0.67       195
#    Unrelated       0.67      0.01      0.03       136

#     accuracy                           0.51       400
#    macro avg       0.54      0.30      0.25       400
# weighted avg       0.58      0.51      0.36       400



In [None]:
# Run #2: classification reports recorded for each model on the test split

# SVM


#               precision    recall  f1-score   support

#        Agree       1.00      0.19      0.31        27
#     Disagree       0.82      0.67      0.74        42
#      Discuss       0.63      0.81      0.71       195
#    Unrelated       0.62      0.51      0.56       136

#     accuracy                           0.65       400
#    macro avg       0.77      0.54      0.58       400
# weighted avg       0.67      0.65      0.63       400


# Random Forest


#               precision    recall  f1-score   support

#        Agree       0.00      0.00      0.00        27
#     Disagree       1.00      0.40      0.58        42
#      Discuss       0.55      0.91      0.69       195
#    Unrelated       0.57      0.25      0.35       136

#     accuracy                           0.57       400
#    macro avg       0.53      0.39      0.40       400
# weighted avg       0.57      0.57      0.51       400


# KNN


#               precision    recall  f1-score   support

#        Agree       0.50      0.11      0.18        27
#     Disagree       0.72      0.31      0.43        42
#      Discuss       0.58      0.67      0.62       195
#    Unrelated       0.45      0.50      0.48       136

#     accuracy                           0.54       400
#    macro avg       0.56      0.40      0.43       400
# weighted avg       0.55      0.54      0.52       400


# Multinomial LogisticRegression


#               precision    recall  f1-score   support

#        Agree       0.00      0.00      0.00        27
#     Disagree       1.00      0.17      0.29        42
#      Discuss       0.50      1.00      0.67       195
#    Unrelated       0.67      0.01      0.03       136

#     accuracy                           0.51       400
#    macro avg       0.54      0.30      0.25       400
# weighted avg       0.58      0.51      0.36       400


In [None]:
# Run #3: classification reports recorded for each model on the test split

# SVM


#               precision    recall  f1-score   support

#        Agree       1.00      0.19      0.31        27
#     Disagree       0.82      0.67      0.74        42
#      Discuss       0.63      0.81      0.71       195
#    Unrelated       0.62      0.51      0.56       136

#     accuracy                           0.65       400
#    macro avg       0.77      0.54      0.58       400
# weighted avg       0.67      0.65      0.63       400


# Random Forest


#               precision    recall  f1-score   support

#        Agree       0.00      0.00      0.00        27
#     Disagree       1.00      0.43      0.60        42
#      Discuss       0.54      0.93      0.68       195
#    Unrelated       0.56      0.18      0.28       136

#     accuracy                           0.56       400
#    macro avg       0.52      0.39      0.39       400
# weighted avg       0.56      0.56      0.49       400


# KNN


#               precision    recall  f1-score   support

#        Agree       0.50      0.11      0.18        27
#     Disagree       0.72      0.31      0.43        42
#      Discuss       0.58      0.67      0.62       195
#    Unrelated       0.45      0.50      0.48       136

#     accuracy                           0.54       400
#    macro avg       0.56      0.40      0.43       400
# weighted avg       0.55      0.54      0.52       400


# Multinomial LogisticRegression


#               precision    recall  f1-score   support

#        Agree       0.00      0.00      0.00        27
#     Disagree       1.00      0.17      0.29        42
#      Discuss       0.50      1.00      0.67       195
#    Unrelated       0.67      0.01      0.03       136

#     accuracy                           0.51       400
#    macro avg       0.54      0.30      0.25       400
# weighted avg       0.58      0.51      0.36       400




In [None]:
# Run #4: classification reports recorded for each model on the test split

# SVM


#               precision    recall  f1-score   support

#        Agree       1.00      0.19      0.31        27
#     Disagree       0.82      0.67      0.74        42
#      Discuss       0.63      0.81      0.71       195
#    Unrelated       0.62      0.51      0.56       136

#     accuracy                           0.65       400
#    macro avg       0.77      0.54      0.58       400
# weighted avg       0.67      0.65      0.63       400


# Random Forest


#               precision    recall  f1-score   support

#        Agree       0.00      0.00      0.00        27
#     Disagree       1.00      0.40      0.58        42
#      Discuss       0.53      0.90      0.67       195
#    Unrelated       0.49      0.19      0.28       136

#     accuracy                           0.55       400
#    macro avg       0.51      0.37      0.38       400
# weighted avg       0.53      0.55      0.48       400


# KNN


#               precision    recall  f1-score   support

#        Agree       0.50      0.11      0.18        27
#     Disagree       0.72      0.31      0.43        42
#      Discuss       0.58      0.67      0.62       195
#    Unrelated       0.45      0.50      0.48       136

#     accuracy                           0.54       400
#    macro avg       0.56      0.40      0.43       400
# weighted avg       0.55      0.54      0.52       400


# Multinomial LogisticRegression


#               precision    recall  f1-score   support

#        Agree       0.00      0.00      0.00        27
#     Disagree       1.00      0.17      0.29        42
#      Discuss       0.50      1.00      0.67       195
#    Unrelated       0.67      0.01      0.03       136

#     accuracy                           0.51       400
#    macro avg       0.54      0.30      0.25       400
# weighted avg       0.58      0.51      0.36       400




In [None]:
# Run #5: classification reports recorded for each model on the test split

# SVM


#               precision    recall  f1-score   support

#        Agree       1.00      0.19      0.31        27
#     Disagree       0.82      0.67      0.74        42
#      Discuss       0.63      0.81      0.71       195
#    Unrelated       0.62      0.51      0.56       136

#     accuracy                           0.65       400
#    macro avg       0.77      0.54      0.58       400
# weighted avg       0.67      0.65      0.63       400


# Random Forest


#               precision    recall  f1-score   support

#        Agree       0.00      0.00      0.00        27
#     Disagree       1.00      0.43      0.60        42
#      Discuss       0.55      0.91      0.68       195
#    Unrelated       0.59      0.25      0.35       136

#     accuracy                           0.57       400
#    macro avg       0.53      0.40      0.41       400
# weighted avg       0.57      0.57      0.51       400


# KNN


#               precision    recall  f1-score   support

#        Agree       0.50      0.11      0.18        27
#     Disagree       0.72      0.31      0.43        42
#      Discuss       0.58      0.67      0.62       195
#    Unrelated       0.45      0.50      0.48       136

#     accuracy                           0.54       400
#    macro avg       0.56      0.40      0.43       400
# weighted avg       0.55      0.54      0.52       400


# Multinomial LogisticRegression


#               precision    recall  f1-score   support

#        Agree       0.00      0.00      0.00        27
#     Disagree       1.00      0.17      0.29        42
#      Discuss       0.50      1.00      0.67       195
#    Unrelated       0.67      0.01      0.03       136

#     accuracy                           0.51       400
#    macro avg       0.54      0.30      0.25       400
# weighted avg       0.58      0.51      0.36       400

