In [1]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



In [2]:
# Load the preprocessed corpus and keep just the text and target columns.
data = pd.read_csv('preprocessed.csv')[['Posts', 'Label']]

In [3]:
# Preview the loaded frame (text column 'Posts', target column 'Label').
data

Unnamed: 0,Posts,Label
0,እነዚህን ወሳኝ ጉዳዮችን የሚያስፈፅም አካል እንዲቋቋምና ክትትል እንዲደረ...,Free
1,ጃዋር አልበግዳዲ በግርግር ስልጣን ለመያዝ የሚያደርገው ነገር ብዙ ስራ ፈ...,Offensive
2,ምን ሆናችሁ ኦርቶዶክሶች ሰሞኑን ፖለቲካ ገባ እንዴ ውስጣችኋይዟችሁ ወደ...,Free
3,ስለ አሳምነው የምትፅፈው ነገር አሳምነው ለአማራ የነበረው ተቆርቋሪነት አ...,Free
4,አጋጣሚውን ተጠቅመው የእስክንድር ነጋን ስም እና ትግል ለማዳከም የሚፈልጉ...,Offensive
...,...,...
5725,ኦህዴድ እራሱ ባመጣዉ ቀዉስ ተናዉጦ መዉጫ ቀዳዳ እየፈለገ ነዉ,Offensive
5726,ኦሮሞ ሀገር መምራት አይችልም,Offensive
5727,ሲማረኩ በሚሰጡት አትታለሉ ማርኮ ከማብላት መረሸን ይቀላል ትግሬ ጎረ...,Offensive
5728,በኦሮሞ የተሞሉ ሴገጤዎች የወሎ ኦሮሞን ምንም ስሌ የማያውቅን ህዝብ ...,Offensive


In [4]:
# Extract the post texts and their class labels from the frame as plain Python lists.
sentences = data.loc[:, 'Posts'].tolist()
labels = data.loc[:, 'Label'].tolist()

In [6]:
# Tokenize every post by splitting it on whitespace.
tokenized_sentences = [post.split() for post in sentences]

In [7]:
# Preview only the first few tokenized posts; echoing the whole list floods
# the notebook with thousands of lines of output.
tokenized_sentences[:3]

[['እነዚህን',
  'ወሳኝ',
  'ጉዳዮችን',
  'የሚያስፈፅም',
  'አካል',
  'እንዲቋቋምና',
  'ክትትል',
  'እንዲደረግ',
  'በመግለጫው',
  'ጠይቀዋል'],
 ['ጃዋር',
  'አልበግዳዲ',
  'በግርግር',
  'ስልጣን',
  'ለመያዝ',
  'የሚያደርገው',
  'ነገር',
  'ብዙ',
  'ስራ',
  'ፈት',
  'ቄሮ',
  'ሊያስጨርስ',
  'እንጂ',
  'የስልጣን',
  'ሽታዋን',
  'አያገኝም',
  'ያባቱ',
  'ሀገር',
  'የመን',
  'ይሂድ',
  'ስልጣን',
  'ካማረው',
  'ወይ',
  'ደግሞ',
  'በናቱ',
  'አማራ',
  'ስለሆለ',
  'አማራ',
  'ክልል',
  'ሂዶ',
  'ይወዳደር'],
 ['ምን',
  'ሆናችሁ',
  'ኦርቶዶክሶች',
  'ሰሞኑን',
  'ፖለቲካ',
  'ገባ',
  'እንዴ',
  'ውስጣችኋይዟችሁ',
  'ወደዱም',
  'ጠሉም',
  'ኢዮጵያ',
  'አትፈርስም',
  'ኦርቶዶክስ',
  'አትከፈልም',
  'ይህንን',
  'ያለው',
  'ጠቅላይ',
  'ሚንስትር',
  'አቢይ',
  'አህመድ',
  'ነውበናትህ',
  'ጡጦ',
  'ግዛልን'],
 ['ስለ',
  'አሳምነው',
  'የምትፅፈው',
  'ነገር',
  'አሳምነው',
  'ለአማራ',
  'የነበረው',
  'ተቆርቋሪነት',
  'አሪፍ',
  'ነበር',
  'ነገር',
  'ግን',
  'እነደ',
  'ፀጥታ',
  'ዘርፍ',
  'ሀላፊ',
  'ሆኖ',
  'ሲሰራ',
  'የፌደራል',
  'ፓሊስም',
  'ሆነ',
  'የኦነግ',
  'ሰራዊት',
  'ወደ',
  'አማራ',
  'ክልል',
  'ከሰኞ',
  'አርብ',
  'አየገቡ',
  'መረጃው',
  'ደርሶት',
  'ነበር',
  'ለምን',
  'ዝግጅት',
  'አላደረገም',
  'የሆነ',
  'ነገር'

In [8]:
# Load pretrained word vectors stored in the plain-text word2vec format
# (the file name suggests 300-dimensional CBOW vectors -- TODO confirm).
word2vec_model = KeyedVectors.load_word2vec_format(
    'Embeddings/5w_10ng_am_w2v_cbow_300D.txt',
    binary=False,
    encoding='utf-8',
)

In [9]:
# Function to generate document vectors
def generate_doc_vector(sentence, model=None):
    """Average the word embeddings of a tokenized sentence into one vector.

    Parameters
    ----------
    sentence : iterable of str
        Tokens of a single document.
    model : optional
        Embedding lookup that supports ``word in model``, ``model[word]`` and
        exposes ``model.vector_size``. When omitted, the notebook-level
        ``word2vec_model`` is used, which keeps existing one-argument calls
        working unchanged.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all tokens found in ``model``.
        If no token is found (or ``sentence`` is empty), a zero vector of
        length ``model.vector_size`` is returned instead.
    """
    if model is None:
        # Fall back to the embeddings loaded earlier in the notebook.
        model = word2vec_model
    # Tokens missing from the embedding vocabulary are skipped silently.
    vectors = [model[word] for word in sentence if word in model]
    if not vectors:
        # Nothing to average: avoid np.mean on an empty list (which yields NaN).
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

In [10]:
# Build the feature rows: one averaged embedding per tokenized post.
X = list(map(generate_doc_vector, tokenized_sentences))

In [11]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Fit a logistic regression classifier (library defaults) on the training split.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

LogisticRegression()

In [13]:
# Predict labels for the held-out test split with the logistic regression model.
y_pred = model.predict(X=X_test)

In [14]:
# Fraction of test posts the logistic regression model labelled correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Logistic Regression Accuracy:", accuracy)

Logistic Regression Accuracy: 0.7041884816753927


In [15]:
# Per-class precision / recall / F1 for the logistic regression predictions.
classification_LR = classification_report(y_true=y_test, y_pred=y_pred)
print("\nLRClassification Report:", classification_LR, sep="\n")


LRClassification Report:
              precision    recall  f1-score   support

        Free       0.71      0.74      0.73       603
   Offensive       0.70      0.66      0.68       543

    accuracy                           0.70      1146
   macro avg       0.70      0.70      0.70      1146
weighted avg       0.70      0.70      0.70      1146



In [17]:
# Confusion matrix of true vs. predicted labels for logistic regression.
conf_matrix_LR = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("\nLRConfusion Matrix:", conf_matrix_LR, sep="\n")


LRConfusion Matrix:
[[447 156]
 [183 360]]


In [18]:
# Fit a 100-tree random forest on the training split; the seed fixes its randomness.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
rf_model.fit(X=X_train, y=y_train)

RandomForestClassifier(random_state=42)

In [19]:
# Predict labels for the held-out test split with the random forest.
rf_predictions = rf_model.predict(X=X_test)

In [20]:
# Fraction of test posts the random forest labelled correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=rf_predictions)
print("Random Forest Accuracy:", accuracy)

Random Forest Accuracy: 0.7268760907504364


In [21]:
# Per-class precision / recall / F1 for the random forest predictions.
classification_RF = classification_report(y_true=y_test, y_pred=rf_predictions)
print("\nRFClassification Report:", classification_RF, sep="\n")


RFClassification Report:
              precision    recall  f1-score   support

        Free       0.72      0.78      0.75       603
   Offensive       0.73      0.67      0.70       543

    accuracy                           0.73      1146
   macro avg       0.73      0.72      0.72      1146
weighted avg       0.73      0.73      0.73      1146



In [22]:
# Confusion matrix of true vs. predicted labels for the random forest.
conf_matrix_RF = confusion_matrix(y_true=y_test, y_pred=rf_predictions)
print("\nRFConfusion Matrix:", conf_matrix_RF, sep="\n")


RFConfusion Matrix:
[[471 132]
 [181 362]]


In [23]:
# Fit a linear-kernel support vector classifier on the training split.
svm_model = SVC(kernel='linear')
svm_model.fit(X=X_train, y=y_train)

SVC(kernel='linear')

In [24]:
# Predict labels for the held-out test split with the SVM.
svm_predictions = svm_model.predict(X=X_test)

In [25]:
# Fraction of test posts the SVM labelled correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=svm_predictions)
print("SVM Accuracy:", accuracy)

SVM Accuracy: 0.7207678883071553


In [26]:
# Per-class precision / recall / F1 for the SVM predictions.
classification_SVM = classification_report(y_true=y_test, y_pred=svm_predictions)
print("\nSVMClassification Report:", classification_SVM, sep="\n")


SVMClassification Report:
              precision    recall  f1-score   support

        Free       0.74      0.73      0.73       603
   Offensive       0.70      0.71      0.71       543

    accuracy                           0.72      1146
   macro avg       0.72      0.72      0.72      1146
weighted avg       0.72      0.72      0.72      1146



In [27]:
# Confusion matrix of true vs. predicted labels for the SVM.
conf_matrix_SVM = confusion_matrix(y_true=y_test, y_pred=svm_predictions)
print("\nSVMConfusion Matrix:", conf_matrix_SVM, sep="\n")


SVMConfusion Matrix:
[[438 165]
 [155 388]]
