In [200]:
import pandas as pd 
import numpy as np
import nltk
import random
import re
from bs4 import BeautifulSoup
import emoji

nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Yakina\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Yakina\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Yakina\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [201]:
data = pd.read_csv('datasets/twitter_training.csv')
data

Unnamed: 0,id,company,sentiment,text
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...
...,...,...,...,...
995,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...
996,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
997,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
998,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


In [202]:
data.dropna(inplace=True)
data.drop_duplicates(inplace=True)

In [203]:
text = pd.DataFrame()
text['text'] = data['text']
text['sentiment'] = data['sentiment']
text

Unnamed: 0,text,sentiment
0,I mentioned on Facebook that I was struggling ...,Irrelevant
1,BBC News - Amazon boss Jeff Bezos rejects clai...,Neutral
2,@Microsoft Why do I pay for WORD when it funct...,Negative
3,"CSGO matchmaking is so full of closet hacking,...",Negative
4,Now the President is slapping Americans in the...,Neutral
...,...,...
995,⭐️ Toronto is the arts and culture capital of ...,Irrelevant
996,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...,Irrelevant
997,Today sucked so it’s time to drink wine n play...,Positive
998,Bought a fraction of Microsoft today. Small wins.,Positive


Augmentation


In [204]:
from nltk.corpus import wordnet

def synonym_replacement(sentence, n=1):
    words = sentence.split()
    new_sentence = words.copy()
    
    for _ in range(n):
        word_idx = random.randint(0, len(words) - 1)
        synonyms = wordnet.synsets(words[word_idx])
        
        if synonyms:
            new_sentence[word_idx] = synonyms[0].lemmas()[0].name()
    
    return " ".join(new_sentence)

text['text2']=text["text"].apply(lambda x: synonym_replacement(x))
(text['text']==text['text2']).value_counts()

False    550
True     450
Name: count, dtype: int64

In [205]:
text.drop('text2', axis=1, inplace=True)

Removing stopwords, punctuation, HTML tags, emojis

In [206]:
from nltk.corpus import stopwords

stop_words= set(stopwords.words('english'))

In [207]:
def clean_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML
    text = emoji.replace_emoji(text, replace="")  # Remove emojis
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = " ".join([word for word in text.split() if word.lower() not in stop_words])  # Remove stopwords
    return text

text["cleaned_text"] = text["text"].apply(clean_text)
text

Unnamed: 0,text,sentiment,cleaned_text
0,I mentioned on Facebook that I was struggling ...,Irrelevant,mentioned Facebook struggling motivation go ru...
1,BBC News - Amazon boss Jeff Bezos rejects clai...,Neutral,BBC News Amazon boss Jeff Bezos rejects claims...
2,@Microsoft Why do I pay for WORD when it funct...,Negative,Microsoft pay WORD functions poorly SamsungUS ...
3,"CSGO matchmaking is so full of closet hacking,...",Negative,CSGO matchmaking full closet hacking truly awf...
4,Now the President is slapping Americans in the...,Neutral,President slapping Americans face really commi...
...,...,...,...
995,⭐️ Toronto is the arts and culture capital of ...,Irrelevant,Toronto arts culture capital Canada wonder wan...
996,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...,Irrelevant,ACTUALLY GOOD MOVE TOT BRING VIEWERS one peopl...
997,Today sucked so it’s time to drink wine n play...,Positive,Today sucked time drink wine n play borderland...
998,Bought a fraction of Microsoft today. Small wins.,Positive,Bought fraction Microsoft today Small wins


Tokenization

In [208]:
from nltk.tokenize import word_tokenize

nltk.download('punkt')

text["tokens"] = text["cleaned_text"].apply(lambda x: word_tokenize(x.lower()))
text

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Yakina\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,text,sentiment,cleaned_text,tokens
0,I mentioned on Facebook that I was struggling ...,Irrelevant,mentioned Facebook struggling motivation go ru...,"[mentioned, facebook, struggling, motivation, ..."
1,BBC News - Amazon boss Jeff Bezos rejects clai...,Neutral,BBC News Amazon boss Jeff Bezos rejects claims...,"[bbc, news, amazon, boss, jeff, bezos, rejects..."
2,@Microsoft Why do I pay for WORD when it funct...,Negative,Microsoft pay WORD functions poorly SamsungUS ...,"[microsoft, pay, word, functions, poorly, sams..."
3,"CSGO matchmaking is so full of closet hacking,...",Negative,CSGO matchmaking full closet hacking truly awf...,"[csgo, matchmaking, full, closet, hacking, tru..."
4,Now the President is slapping Americans in the...,Neutral,President slapping Americans face really commi...,"[president, slapping, americans, face, really,..."
...,...,...,...,...
995,⭐️ Toronto is the arts and culture capital of ...,Irrelevant,Toronto arts culture capital Canada wonder wan...,"[toronto, arts, culture, capital, canada, wond..."
996,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...,Irrelevant,ACTUALLY GOOD MOVE TOT BRING VIEWERS one peopl...,"[actually, good, move, tot, bring, viewers, on..."
997,Today sucked so it’s time to drink wine n play...,Positive,Today sucked time drink wine n play borderland...,"[today, sucked, time, drink, wine, n, play, bo..."
998,Bought a fraction of Microsoft today. Small wins.,Positive,Bought fraction Microsoft today Small wins,"[bought, fraction, microsoft, today, small, wins]"


Stemming and Lemmatization

In [209]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

text["stemmed"] = text["tokens"].apply(lambda x: [stemmer.stem(word) for word in x])

text

Unnamed: 0,text,sentiment,cleaned_text,tokens,stemmed
0,I mentioned on Facebook that I was struggling ...,Irrelevant,mentioned Facebook struggling motivation go ru...,"[mentioned, facebook, struggling, motivation, ...","[mention, facebook, struggl, motiv, go, run, d..."
1,BBC News - Amazon boss Jeff Bezos rejects clai...,Neutral,BBC News Amazon boss Jeff Bezos rejects claims...,"[bbc, news, amazon, boss, jeff, bezos, rejects...","[bbc, news, amazon, boss, jeff, bezo, reject, ..."
2,@Microsoft Why do I pay for WORD when it funct...,Negative,Microsoft pay WORD functions poorly SamsungUS ...,"[microsoft, pay, word, functions, poorly, sams...","[microsoft, pay, word, function, poorli, samsu..."
3,"CSGO matchmaking is so full of closet hacking,...",Negative,CSGO matchmaking full closet hacking truly awf...,"[csgo, matchmaking, full, closet, hacking, tru...","[csgo, matchmak, full, closet, hack, truli, aw..."
4,Now the President is slapping Americans in the...,Neutral,President slapping Americans face really commi...,"[president, slapping, americans, face, really,...","[presid, slap, american, face, realli, commit,..."
...,...,...,...,...,...
995,⭐️ Toronto is the arts and culture capital of ...,Irrelevant,Toronto arts culture capital Canada wonder wan...,"[toronto, arts, culture, capital, canada, wond...","[toronto, art, cultur, capit, canada, wonder, ..."
996,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...,Irrelevant,ACTUALLY GOOD MOVE TOT BRING VIEWERS one peopl...,"[actually, good, move, tot, bring, viewers, on...","[actual, good, move, tot, bring, viewer, one, ..."
997,Today sucked so it’s time to drink wine n play...,Positive,Today sucked time drink wine n play borderland...,"[today, sucked, time, drink, wine, n, play, bo...","[today, suck, time, drink, wine, n, play, bord..."
998,Bought a fraction of Microsoft today. Small wins.,Positive,Bought fraction Microsoft today Small wins,"[bought, fraction, microsoft, today, small, wins]","[bought, fraction, microsoft, today, small, win]"


In [210]:
from nltk.stem import WordNetLemmatizer
lemmatizer= WordNetLemmatizer()

text['lemmatization'] = text['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
text

Unnamed: 0,text,sentiment,cleaned_text,tokens,stemmed,lemmatization
0,I mentioned on Facebook that I was struggling ...,Irrelevant,mentioned Facebook struggling motivation go ru...,"[mentioned, facebook, struggling, motivation, ...","[mention, facebook, struggl, motiv, go, run, d...","[I, , m, e, n, t, i, o, n, e, d, , o, n, , ..."
1,BBC News - Amazon boss Jeff Bezos rejects clai...,Neutral,BBC News Amazon boss Jeff Bezos rejects claims...,"[bbc, news, amazon, boss, jeff, bezos, rejects...","[bbc, news, amazon, boss, jeff, bezo, reject, ...","[B, B, C, , N, e, w, s, , -, , A, m, a, z, ..."
2,@Microsoft Why do I pay for WORD when it funct...,Negative,Microsoft pay WORD functions poorly SamsungUS ...,"[microsoft, pay, word, functions, poorly, sams...","[microsoft, pay, word, function, poorli, samsu...","[@, M, i, c, r, o, s, o, f, t, , W, h, y, , ..."
3,"CSGO matchmaking is so full of closet hacking,...",Negative,CSGO matchmaking full closet hacking truly awf...,"[csgo, matchmaking, full, closet, hacking, tru...","[csgo, matchmak, full, closet, hack, truli, aw...","[C, S, G, O, , m, a, t, c, h, m, a, k, i, n, ..."
4,Now the President is slapping Americans in the...,Neutral,President slapping Americans face really commi...,"[president, slapping, americans, face, really,...","[presid, slap, american, face, realli, commit,...","[N, o, w, , t, h, e, , P, r, e, s, i, d, e, ..."
...,...,...,...,...,...,...
995,⭐️ Toronto is the arts and culture capital of ...,Irrelevant,Toronto arts culture capital Canada wonder wan...,"[toronto, arts, culture, capital, canada, wond...","[toronto, art, cultur, capit, canada, wonder, ...","[⭐, ️, , T, o, r, o, n, t, o, , i, s, , t, ..."
996,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...,Irrelevant,ACTUALLY GOOD MOVE TOT BRING VIEWERS one peopl...,"[actually, good, move, tot, bring, viewers, on...","[actual, good, move, tot, bring, viewer, one, ...","[t, H, I, S, , I, S, , A, C, T, U, A, L, L, ..."
997,Today sucked so it’s time to drink wine n play...,Positive,Today sucked time drink wine n play borderland...,"[today, sucked, time, drink, wine, n, play, bo...","[today, suck, time, drink, wine, n, play, bord...","[T, o, d, a, y, , s, u, c, k, e, d, , s, o, ..."
998,Bought a fraction of Microsoft today. Small wins.,Positive,Bought fraction Microsoft today Small wins,"[bought, fraction, microsoft, today, small, wins]","[bought, fraction, microsoft, today, small, win]","[B, o, u, g, h, t, , a, , f, r, a, c, t, i, ..."


Vectorization: Bag of Words (BoW), Term Frequency - Inverse Document Frequency (TF-IDF), Word2Vec, GloVe

In [211]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
bow = vectorizer.fit_transform(text["cleaned_text"])

In [212]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer() 
X_tfidf = tfidf_vectorizer.fit_transform(text["cleaned_text"])

In [213]:
from gensim.models import Word2Vec

sentences = text["tokens"].tolist()
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

word_vectors = [word2vec_model.wv[word] for word in text["tokens"][0] if word in word2vec_model.wv]

Naive Bayes, SVM, Random Forest algorithm testing

In [214]:
from sklearn.model_selection import train_test_split

X = X_tfidf  
y = text["sentiment"] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [215]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred))


Naive Bayes Accuracy: 0.545


In [216]:
from sklearn.svm import SVC

svm = SVC(kernel="linear")
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))

SVM Accuracy: 0.53


In [217]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))


Random Forest Accuracy: 0.43
