In [73]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from gensim.models.word2vec import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [35]:
df=pd.read_csv("/content/twitter_training.csv")
df.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [36]:
df.value_counts().sum()

73995

Data Preprocessing

In [37]:
df.columns=['id','label','sentiment','text']

In [38]:
df.drop(columns=['id','label'],inplace=True)

In [39]:
df.head()

Unnamed: 0,sentiment,text
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...


In [40]:
print(df.dtypes)

sentiment    object
text         object
dtype: object


In [41]:
negative_df=df[(df['sentiment']=='Negative')]
positive_df=df[df['sentiment']=='Positive']
neutral_df=df[df['sentiment']=='Neutral']


In [42]:
negative_df.value_counts().sum()

22358

In [43]:
classes=pd.DataFrame(df.groupby('sentiment').size().reset_index(name='count'))
display(classes)
print(classes.columns)

Unnamed: 0,sentiment,count
0,Irrelevant,12990
1,Negative,22542
2,Neutral,18318
3,Positive,20831


Index(['sentiment', 'count'], dtype='object')


In [44]:
fig=px.bar(classes,x='sentiment',y='count',title='Sentiment Analysis')
fig.show()

In [45]:
def preprocess_text(text):
    text = str(text)
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    text = text.lower()
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    text = url_pattern.sub(' ', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    filtered_words = [word for word in text.split() if word not in stop_words]
    contractions = r"(\b(?:U\.S|Dr|Mrs|Mr|i\.e|e\.g)\.|\b(?:I'm|you're|they're|it's|doesn't|didn't)\b)"
    text = re.sub(contractions, lambda x: x.group(0).replace('.', '<PERIOD>'), text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.replace('<PERIOD>', '.')
    tokens = word_tokenize(text)

    stemmed_words = [stemmer.stem(word) for word in filtered_words]

    return ' '.join(stemmed_words)

In [46]:
df['text']=df['text'].apply(preprocess_text)

In [47]:
df=df[df['sentiment']!='Irrelevant']


In [48]:
## word embeddings using word2vec
tokenized_sentences=[sentence.split() for sentence in df['text']]
model=Word2Vec(
    tokenized_sentences,
    vector_size=124,
    window=3,
    min_count=1,
    sg=0,
    hs=0,
    negative=3,

    seed=42
)
model.save('word2vec.model')

In [49]:
vector=model.wv['game']
print(vector)

[ 0.4607846   0.43180534  0.09060197 -0.26190895  0.928515   -1.1430717
 -1.2260845   0.25637534 -0.00538889 -0.04467089 -0.2139939  -0.6064433
 -0.6703867   0.6691535  -0.6051628  -0.2864166  -0.43625468 -0.85733587
  1.3717426   0.12179438 -0.10692293  0.74318093 -1.4276263   0.83887064
  0.5519364  -0.20704967 -0.7580093   0.51964086 -0.46201715  0.65478647
 -0.31657028 -0.00241484 -1.2135249   0.2625398  -0.31715775 -0.05762023
 -0.4432319   0.8963749   0.25307325  0.38807714 -0.6984603  -0.2861553
 -0.4167349   0.48817384 -0.8557047   0.55937314 -0.23377089  0.11595926
  0.40667573  0.6163373  -0.8072136  -0.7301107   0.01134639 -0.19329292
  0.4418667  -0.3413339   1.1143571  -0.55857086  0.02440971  0.4306212
  0.43965238  0.8259378  -0.38844717  0.56636095 -0.76062     0.3792356
 -1.0794433   0.19009371 -0.54882777 -0.31727603  0.242561    0.39317274
 -0.01351732 -1.4443259   0.7563603  -0.06237847  0.73378074 -0.39187995
  0.78864217  0.25814888 -0.04511887 -0.5125802  -1.0033

In [50]:
def get_sentence_embedding(sentence):
    words=sentence.split()
    embeddings=[]
    # Access word vectors using model.wv[word]
    word_vectors=[model.wv[word] for word in words if word in model.wv]
    if len(word_vectors)>0:
        sentence_embedding=np.mean(word_vectors,axis=0)
    else:
        sentence_embedding=np.zeros(model.vector_size)
    return sentence_embedding

In [51]:
df['embedded'] = df['text'].apply(get_sentence_embedding)
df

Unnamed: 0,sentiment,text,embedded
0,Positive,come border kill,"[0.5603331, -0.10456856, 0.011734975, -0.13234..."
1,Positive,im get borderland kill,"[0.8175131, -0.042812422, -0.1934607, -0.51753..."
2,Positive,im come borderland murder,"[0.7544334, -0.13389954, -0.10355122, -0.40914..."
3,Positive,im get borderland 2 murder,"[0.8340379, -0.17756537, -0.112977505, -0.4618..."
4,Positive,im get borderland murder,"[0.7112045, -0.04655292, -0.13897565, -0.43674..."
...,...,...,...
74676,Positive,realiz window partit mac like 6 year behind nv...,"[0.28969136, -0.1818342, 0.13205509, 0.0168425..."
74677,Positive,realiz mac window partit 6 year behind nvidia ...,"[0.2875464, -0.21330959, 0.13715516, 0.0737188..."
74678,Positive,realiz window partit mac 6 year behind nvidia ...,"[0.2875464, -0.21330959, 0.13715516, 0.0737188..."
74679,Positive,realiz window partit mac like 6 year behind nv...,"[0.38977087, -0.1052506, 0.07870153, 0.0528664..."


In [55]:
sentiment_mapping = {'Positive': 1, 'Negative': -1, 'Neutral': 0}
df['sentiment_encoded'] = df['sentiment'].map(sentiment_mapping)


In [56]:
x=df['embedded']
y=df['sentiment_encoded']
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)


In [75]:
X=df['embedded'].to_numpy()
X = np.vstack(X)  # Reshape X into a 2D array
y=df['sentiment']
X_train,X_test,y_train,y_test=train_test_split(
    X,y,test_size=0.4,random_state=42
)
logistic_model=LogisticRegression()
logistic_model.fit(X_train,y_train)
y_pred=logistic_model.predict(X_test)
accuracy=accuracy_score(y_test,y_pred)
print(f"Accuracy:{accuracy}")

Accuracy:0.6089070794667099



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



In [97]:
def get_sentence_embedding(sentence):
    words=sentence.split()
    word2vec='word2vec.model'
    # Access word vectors using model.wv[word]
    word_vectors=[model.wv[word] for word in words if word in model.wv]
    if len(word_vectors)>0:
        sentence_embedding=np.mean(word_vectors,axis=0)
    else:
        sentence_embedding=np.zeros(model.vector_size)
    return sentence_embedding
def predict_sentiment(model,sentence):
    import joblib
    loaded_model = joblib.load('logistic_model.pkl')
    sentence=preprocess_text(sentence)
    print(sentence)
    sentence_embedding=get_sentence_embedding(sentence)
    sentence_embedding=sentence_embedding.reshape(1,-1)
    sentiment=model.predict(sentence_embedding)
    return sentiment[0]

In [98]:
predict_sentiment(logistic_model,"Rocket League, Sea of Thieves or Rainbow Six: Siege🤔? I love playing all three on stream but which is the best? #stream #twitch #RocketLeague #SeaOfThieves #RainbowSixSiege #follow")

rocket leagu sea thiev rainbow six sieg love play three stream best stream twitch rocketleagu seaofthiev rainbowsixsieg follow


'Positive'