In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer, LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import string
from string import punctuation
import nltk
import re

In [None]:
# Read the comments CSV into a DataFrame and display it.
COMMENTS_CSV = '/content/NLP_Project_translates(likes,comments).csv'
df = pd.read_csv(COMMENTS_CSV)
df

Unnamed: 0,Index,Likes,Comment
0,1,0,"Just everyone, stop picking out the difference..."
1,2,0,What gives me the right to judge a trans-gende...
2,3,0,they dont want to exploit they want to depopul...
3,4,0,"Your birth certificate determines your gender,..."
4,5,0,We’re having these stupid debates because of t...
...,...,...,...
1564,1565,0,"When he said that ""we dont fight with Meiteis ..."
1565,1566,0,Speaking lies will not help in any situation.
1566,1567,0,meiteis were lie..they have 5000 looted automa...
1567,1568,0,all problems will be solved if the central gov...


In [None]:
# Only the comment text is used by the later cells, so discard the other
# two columns (in place) and show what is left.
df.drop(columns=['Index', 'Likes'], inplace=True)
df

Unnamed: 0,Comment
0,"Just everyone, stop picking out the difference..."
1,What gives me the right to judge a trans-gende...
2,they dont want to exploit they want to depopul...
3,"Your birth certificate determines your gender,..."
4,We’re having these stupid debates because of t...
...,...
1564,"When he said that ""we dont fight with Meiteis ..."
1565,Speaking lies will not help in any situation.
1566,meiteis were lie..they have 5000 looted automa...
1567,all problems will be solved if the central gov...


In [None]:
# Compiled once at definition time instead of on every call.
# Every range below is an emoji/symbol block. A single catch-all range such as
# U+24C2..U+1F251 must not be used here: it spans most of the Basic
# Multilingual Plane and would also delete CJK, Hangul and other real text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # geometric shapes extended
    "\U0001F800-\U0001F8FF"  # supplemental arrows-C
    "\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    "\U0001FA00-\U0001FA6F"  # chess symbols
    "\U0001FA70-\U0001FAFF"  # symbols and pictographs extended-A
    "\U0001F000-\U0001F251"  # tiles/cards, enclosed alphanumerics (incl. flag letters), enclosed ideographs
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U0000FE00-\U0000FE0F"  # variation selectors (emoji presentation)
    "\U000024C2"             # circled latin capital letter M
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Only characters from the symbol blocks listed in ``_EMOJI_PATTERN`` are
    deleted; letters of any script (Latin, CJK, Hangul, ...) are kept.

    Args:
        text: The string to clean.

    Returns:
        A new string with every matched run of emoji characters deleted.
        Surrounding whitespace is left as is.
    """
    return _EMOJI_PATTERN.sub('', text)

In [None]:
# Strip emoji from every comment, element by element.
df['Comment'] = df['Comment'].map(remove_emojis)

In [None]:
# Display the frame after the emoji-stripping step.
df

Unnamed: 0,Comment
0,"Just everyone, stop picking out the difference..."
1,What gives me the right to judge a trans-gende...
2,they dont want to exploit they want to depopul...
3,"Your birth certificate determines your gender,..."
4,We’re having these stupid debates because of t...
...,...
1564,"When he said that ""we dont fight with Meiteis ..."
1565,Speaking lies will not help in any situation.
1566,meiteis were lie..they have 5000 looted automa...
1567,all problems will be solved if the central gov...


In [None]:
# NLTK data packages requested by this notebook, fetched in this order.
NLTK_RESOURCES = ('punkt', 'wordnet', 'stopwords', 'omw-1.4')
download_results = [nltk.download(resource) for resource in NLTK_RESOURCES]

# Show the status returned by the final download as the cell output.
download_results[-1]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# English stop-word list used to filter tokens during text cleaning.
stop_words = stopwords.words('english')

# Lemmatizer applied to every remaining token.
lzr = WordNetLemmatizer()

# Alternative stemmers; in the visible notebook they are only referenced
# from commented-out lines.
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()
snowball_stemer = SnowballStemmer("english")

In [None]:
def text_processing(text):
    """Normalise a raw comment into a space-separated string of lemmas.

    Steps, in order: lowercase; delete apostrophes; turn every run of
    punctuation, underscores, whitespace and other non-alphanumeric characters
    into a single space; tokenize; drop tokens found in ``stop_words``;
    lemmatize each remaining token with ``lzr``.

    Args:
        text: The comment to clean (str).

    Returns:
        The cleaned comment, tokens joined by single spaces. May be an empty
        string when nothing survives the filtering.
    """
    # convert text into lowercase
    text = text.lower()

    # Delete apostrophes (straight and typographic) so a contraction stays one
    # token ("don't" / "don\u2019t" -> "dont") instead of breaking into pieces.
    text = re.sub("['\u2019]", "", text)

    # Replace punctuation, underscores, newlines/whitespace and any other
    # special characters with a space. Using a space rather than an empty
    # string keeps neighbouring words apart: "lie..they" -> "lie they",
    # not "liethey".
    text = re.sub(r'[\W_]+', ' ', text)

    # Tokenize once and filter stop words in the same pass.
    tokens = [word for word in word_tokenize(text) if word not in stop_words]

    # lemmatizer using WordNetLemmatizer from nltk package (used instead of stemming)
    return ' '.join(lzr.lemmatize(word) for word in tokens)

In [None]:
# Work on a copy so `df` keeps the uncleaned (emoji-stripped) comments.
df_copy = df.copy()
df_copy['Comment'] = df_copy['Comment'].apply(text_processing)

In [None]:
# Inspect the comments after they have been run through text_processing.
df_copy

Unnamed: 0,Comment
0,everyone stop picking difference world would s...
1,give right judge transgendered female simple i...
2,dont want exploit want depopulate world coz co...
3,birth certificate determines gender call want ...
4,stupid debate stupid people want identify some...
...,...
1564,said dont fight meiteis fight u really thought...
1565,speaking lie help situation
1566,meiteis liethey 5000 looted automatic weapon 6...
1567,problem solved central government issue nrc no...


In [None]:
# Fetch the VADER lexicon; presumably required by the SentimentIntensityAnalyzer used next.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
sentiments = SentimentIntensityAnalyzer()

# Score every comment exactly once and reuse the resulting dict for all four
# columns (scoring each comment separately per column quadruples the work).
# NOTE(review): scores are computed on the already-cleaned text; VADER is
# generally described as sensitive to negation words, capitalisation and
# punctuation, so scoring the uncleaned comments may label more accurately.
# Confirm before changing.
polarity = [sentiments.polarity_scores(comment) for comment in df_copy["Comment"]]
df_copy["Positive"] = [scores["pos"] for scores in polarity]
df_copy["Negative"] = [scores["neg"] for scores in polarity]
df_copy["Neutral"] = [scores["neu"] for scores in polarity]
df_copy['Compound'] = [scores["compound"] for scores in polarity]

# Map the compound score to a label: >= 0.05 is Positive, <= -0.05 is
# Negative, anything in between is Neutral.
score = df_copy["Compound"].values
sentiment = [
    'Positive' if value >= 0.05 else 'Negative' if value <= -0.05 else 'Neutral'
    for value in score
]
df_copy["Sentiment"] = sentiment
df_copy.head()

Unnamed: 0,Comment,Positive,Negative,Neutral,Compound,Sentiment
0,everyone stop picking difference world would s...,0.082,0.235,0.683,-0.5209,Negative
1,give right judge transgendered female simple i...,0.0,0.0,1.0,0.0,Neutral
2,dont want exploit want depopulate world coz co...,0.109,0.389,0.503,-0.3287,Negative
3,birth certificate determines gender call want ...,0.14,0.0,0.86,0.0772,Positive
4,stupid debate stupid people want identify some...,0.137,0.466,0.397,-0.8779,Negative


In [None]:
# Keep only the text and its label; the raw VADER scores are not needed further.
score_columns = ['Positive', 'Negative', 'Neutral', 'Compound']
df2 = df_copy.drop(columns=score_columns)
df2.head()

Unnamed: 0,Comment,Sentiment
0,everyone stop picking difference world would s...,Negative
1,give right judge transgendered female simple i...,Neutral
2,dont want exploit want depopulate world coz co...,Negative
3,birth certificate determines gender call want ...,Positive
4,stupid debate stupid people want identify some...,Negative


In [None]:
# Learn the label classes, then replace the string labels with their integer codes.
le = LabelEncoder()
le.fit(df2['Sentiment'])
df2['Sentiment'] = le.transform(df2['Sentiment'])

In [None]:
# Assemble the modelling frame: cleaned comment text plus encoded sentiment label.
processed_data = pd.DataFrame({
    'Comment': df2['Comment'],
    'Sentiment': df2['Sentiment'],
})
processed_data.head()

Unnamed: 0,Comment,Sentiment
0,everyone stop picking difference world would s...,0
1,give right judge transgendered female simple i...,1
2,dont want exploit want depopulate world coz co...,0
3,birth certificate determines gender call want ...,2
4,stupid debate stupid people want identify some...,0


In [None]:
# Class balance of the encoded labels.
class_counts = processed_data['Sentiment'].value_counts()
print(class_counts)

# Recover which integer code the encoder assigned to each label name.
class_names = le.classes_
class_codes = le.transform(class_names)
label_mapping = dict(zip(class_names, class_codes))
print("Label Mapping:", label_mapping, sep="\n")

0    560
2    554
1    455
Name: Sentiment, dtype: int64
Label Mapping:
{'Negative': 0, 'Neutral': 1, 'Positive': 2}


In [None]:
# One frame per encoded class (0 = Negative, 1 = Neutral, 2 = Positive
# according to the label mapping printed earlier).
sentiment_codes = processed_data['Sentiment']
df_negative = processed_data.loc[sentiment_codes == 0]
df_neutral = processed_data.loc[sentiment_codes == 1]
df_positive = processed_data.loc[sentiment_codes == 2]

In [None]:
# Upsample the neutral and positive classes (sampling with replacement) to the
# size of the negative class, which is the largest class in the counts printed
# earlier. The target is derived from the data instead of being hard-coded so
# the cell stays correct if the dataset or the labelling changes.
n_target = len(df_negative)

df_neutral_upsampled = resample(df_neutral,
                                replace=True,
                                n_samples=n_target,
                                random_state=42)

df_positive_upsampled = resample(df_positive,
                                 replace=True,
                                 n_samples=n_target,
                                 random_state=42)

# NOTE(review): the train/test splits further down are drawn from this
# upsampled frame, so copies of one comment can land in both the training and
# the test portion. Consider splitting first and upsampling only the training
# rows.

# Concatenate the untouched negative rows with the two upsampled classes
final_data = pd.concat([df_negative, df_neutral_upsampled, df_positive_upsampled])

In [None]:
# A notebook cell only displays its last expression, so the class counts have
# to be printed explicitly to be visible alongside the frame.
print(final_data['Sentiment'].value_counts())
final_data

Unnamed: 0,Comment,Sentiment
0,everyone stop picking difference world would s...,0
2,dont want exploit want depopulate world coz co...,0
4,stupid debate stupid people want identify some...,0
6,fact conversation scare simple there men woman...,0
12,find difficult people actually born wrong gend...,0
...,...,...
1293,get contact number need open tell give training,2
1201,super sir,2
57,406 part ive watched sooo many time always end...,2
1254,sir told well said form filled,2


In [None]:
# Plain Python list of the cleaned comments, in frame order; preview the first five.
corpus = final_data['Comment'].tolist()
corpus[:5]

['everyone stop picking difference world would solve absolutely everything racial problem identify etc world really gone way far',
 'dont want exploit want depopulate world coz confusion limit reproduction',
 'stupid debate stupid people want identify something u expect lilly rose plant like seriously present world scenario fucked',
 'fact conversation scare simple there men woman cosplayers people transgender legally give thumb gender born absurd',
 'find difficult people actually born wrong gender suffer enormously life entering operation gender changed mean must really difficult watch movie danish girl topic completely different thing somebody deciding feel like woman one day refuse use term workperson wont go madness cant use word boy girl cause someone get insulted']

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features limited to a 1500-term vocabulary, converted to a
# dense array.
cv = CountVectorizer(max_features=1500)
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

# Targets: the encoded sentiment label, i.e. the last column of final_data.
y = final_data['Sentiment'].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Hold out 10% of the rows for testing (fixed seed) and fit GaussianNB on the rest.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0
)
classifier = GaussianNB().fit(X_train, y_train)
classifier

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Predict the held-out rows with the GaussianNB model and tabulate
# true labels against predicted labels.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[47,  6,  5],
       [ 4, 50,  2],
       [22,  5, 27]])

In [None]:
# Share of held-out rows where the current y_pred matches y_test.
nb_score = accuracy_score(y_true=y_test, y_pred=y_pred)
print('accuracy', nb_score)

accuracy 0.7380952380952381


In [None]:
from sklearn.naive_bayes import MultinomialNB

# New 80/20 split (fixed seed); fit MultinomialNB on the training portion.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
multinomial_nb_classifier = MultinomialNB().fit(X_train, y_train)
multinomial_nb_classifier

In [None]:
# Evaluate the MultinomialNB model fitted in the previous cell. Predicting
# with `classifier` here would score the earlier GaussianNB model instead,
# and that model was trained on a different split.
y_pred = multinomial_nb_classifier.predict(X_test)
cm1 = confusion_matrix(y_test, y_pred)
cm1

array([[115,   6,   1],
       [  0, 107,   1],
       [ 33,   8,  65]])

In [None]:
# Accuracy of the most recent y_pred against the current y_test.
nb_score = accuracy_score(y_true=y_test, y_pred=y_pred)
print('accuracy', nb_score)

accuracy 0.8541666666666666


In [None]:
from sklearn.naive_bayes import BernoulliNB

# Same 80/20 split parameters as the MultinomialNB cell; fit BernoulliNB on
# the training portion.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
bernoulli_nb_classifier = BernoulliNB().fit(X_train, y_train)
bernoulli_nb_classifier

In [None]:
# Evaluate the BernoulliNB model fitted in the previous cell. Predicting with
# `classifier` here would score the earlier GaussianNB model instead and
# reproduce its confusion matrix.
y_pred = bernoulli_nb_classifier.predict(X_test)
cm2 = confusion_matrix(y_test, y_pred)
cm2

array([[115,   6,   1],
       [  0, 107,   1],
       [ 33,   8,  65]])

In [None]:
# Fraction of rows in y_test that the latest y_pred labels correctly.
nb_score = accuracy_score(y_true=y_test, y_pred=y_pred)
print('accuracy', nb_score)

accuracy 0.8541666666666666
