In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from random import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix
from nltk.stem.isri import ISRIStemmer
from textblob import TextBlob
import re
import Levenshtein
import cupy as cp
from sklearn.metrics import roc_auc_score

<h1>Retrive the data</h1>

In [2]:
path = r"C:\Users\yazan\Documents\NAJAH\NLP\HWS\HW1\tweet_sentiment.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,Tweet,Sentiment,Sentiment_Expression
0,"""أنا أؤمن بأن الانسان ينطفئ جماله عند ابتعاد م...",negative,implicit
1,من الذاكره... @3FInQe . عندما اعتقد كريستيانو ...,positive,explicit
2,لا نخلو من ضغوطات الحياة. فنحن نعيش على أرض أع...,neutral,none
3,#مصطلحات_لبنانيه_حيرت_البشريه بتوصل عالبيت ، ب...,negative,explicit
4,نصمت !! لتسير حياتنا على مً يرام فالناّس لم تع...,negative,explicit


<h1>Reduce the classes</h1>

In [3]:
print(df['Sentiment'].unique())
df['Sentiment'] = df['Sentiment'].replace('very_positive','positive')
df['Sentiment'] = df['Sentiment'].replace('very_negative','negative')
print(df['Sentiment'].unique())
df = df.drop('Sentiment_Expression', axis=1)
df.head()

['negative' 'positive' 'neutral' 'very_positive' 'very_negative']
['negative' 'positive' 'neutral']


Unnamed: 0,Tweet,Sentiment
0,"""أنا أؤمن بأن الانسان ينطفئ جماله عند ابتعاد م...",negative
1,من الذاكره... @3FInQe . عندما اعتقد كريستيانو ...,positive
2,لا نخلو من ضغوطات الحياة. فنحن نعيش على أرض أع...,neutral
3,#مصطلحات_لبنانيه_حيرت_البشريه بتوصل عالبيت ، ب...,negative
4,نصمت !! لتسير حياتنا على مً يرام فالناّس لم تع...,negative


In [4]:
df['Sentiment'].value_counts()

negative    1883
positive    1232
neutral      885
Name: Sentiment, dtype: int64

<h1>Check for null</h1>

In [5]:
df.isnull().any(axis=0)

Tweet        False
Sentiment    False
dtype: bool

# remove punctuations

In [6]:
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = '!"$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
punctuations_list = arabic_punctuations + english_punctuations

def remove_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

# remove repeating charachters

In [7]:
def remove_repeating_char(text):
    return re.sub(r'(.)\1+', r'\1', text)

# stemming ,change all urls into same feature, remove numbers apply all functions and tokenization

In [8]:
stop_words=stopwords.words('arabic')
stemmerar = ISRIStemmer()
semmed_stop_words = [stemmerar.stem(word) for word in stop_words]
def textblob_tokenizer(str_input):
    str_no_url = re.sub(r'(https?://(?:www\.)?\S+)', '#رابط#', str_input)
    str_no_punctuations = remove_punctuations(str_no_url)
    str_no_number = re.sub(r'[a-zA-Z1-9]', '', str_no_punctuations)
    stemmed_word = stemmerar.stem(str_no_number)
    tokens = stemmed_word.split()
    words = []
    token_temp = ''
    for token in tokens:
        token_temp = remove_repeating_char(token)
        if token_temp not in stop_words and token_temp not in semmed_stop_words:
            words.append(token_temp)

    return words

# splitting data into test and training and building the matrix

In [9]:
%%time
x = df['Tweet']
y = df['Sentiment']
x, x_test, y, y_test = train_test_split(x,y, stratify=y, test_size=0.3)

vec = CountVectorizer(tokenizer=textblob_tokenizer)

x = vec.fit_transform(x)
x_test = vec.transform(x_test)


Wall time: 2.22 s


# building the model

In [10]:
%%time
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()

model.fit(x, y)

Wall time: 12 ms


MultinomialNB()

# test the model and print the model accurices

In [11]:
%%time
from sklearn.metrics import classification_report
print(model.score(x_test, y_test))
y_pred = model.predict(x_test)
cf_matrix = confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))
print("matrix: ",cf_matrix)
y_prob = model.predict_proba(x_test)
auc = roc_auc_score(y_test, y_prob, multi_class='ovo')
print("AUC:", auc)

0.705
              precision    recall  f1-score   support

    negative       0.73      0.83      0.78       565
     neutral       0.60      0.49      0.54       265
    positive       0.73      0.66      0.69       370

    accuracy                           0.70      1200
   macro avg       0.68      0.66      0.67      1200
weighted avg       0.70      0.70      0.70      1200

matrix:  [[470  50  45]
 [ 87 130  48]
 [ 86  38 246]]
AUC: 0.8321617710547415
Wall time: 34 ms
