In [1]:
# Importing the Required modules
import numpy as np 
import pandas as pd

In [2]:
sad = pd.read_csv('sad.csv')
happy= pd.read_csv('happy.csv')
angry=pd.read_csv('angry.csv')

In [3]:
sad.shape,happy.shape,angry.shape

((635, 2), (708, 2), (696, 2))

In [4]:
sad.head()

Unnamed: 0,content,sentiment
0,"Never hurt people who love you a lot, because ...",sad
1,Don’t expect me to tell you what you did wrong...,sad
2,I preferred walking away than fighting for you...,sad
3,"Moving forward in life isn’t the hard part, it...",sad
4,"Never cry for anyone in your life, because tho...",sad


In [5]:
sad = sad.drop_duplicates(subset='content', keep="first")

In [6]:
happy = happy.drop_duplicates(subset='content', keep="first")

In [7]:
angry = angry.drop_duplicates(subset='content', keep="first")

In [8]:
frames = [sad, happy, angry]

df = pd.concat(frames)

In [9]:
df.shape

(1592, 2)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1592 entries, 0 to 694
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   content    1592 non-null   object
 1   sentiment  1592 non-null   object
dtypes: object(2)
memory usage: 37.3+ KB


In [11]:
df.head()

Unnamed: 0,content,sentiment
0,"Never hurt people who love you a lot, because ...",sad
1,Don’t expect me to tell you what you did wrong...,sad
2,I preferred walking away than fighting for you...,sad
3,"Moving forward in life isn’t the hard part, it...",sad
4,"Never cry for anyone in your life, because tho...",sad


In [12]:
df = df.drop_duplicates(subset='content', keep="first")

In [13]:
df.shape

(1586, 2)

In [14]:
df['sentiment'].value_counts()

happy    702
angry    494
sad      390
Name: sentiment, dtype: int64

# Replacing categorical values as :
 `happy ----> 1`

 `angry ----> 0`

 `sad   ----> 2

In [15]:
df['sentiment'].replace({'happy':1,'angry':0,'sad':2},inplace=True)

In [16]:
df['sentiment'].value_counts()

1    702
0    494
2    390
Name: sentiment, dtype: int64

### Following basic steps are required for nlp :
* 1.Cleaning html tag
* 2.Converting to lower
* 3.Remove special characters
* 4.Remove stop words
* 5.Perform stemming

In [17]:
## cleaning html tag

import re
def clean_html(text):
    
    clean = re.compile('<.*?>')
    return re.sub(clean, '',text)
    
df['content']=df['content'].apply(clean_html)

In [18]:
## converting to lower 

def convert_lower(text):
    return text.lower()

df['content']=df['content'].apply(convert_lower)

In [19]:
def remove_special(text):
        x=''
        for i in text:
            if i.isalnum():
                x=x+i
            else:
                x=x+' '
        return x

df['content']=df['content'].apply(remove_special)

In [20]:
import nltk
from nltk.corpus import stopwords

def remove_stopwords(text):
    x=[]
    for i in text.split():
        
        if i not in stopwords.words('english'):
            x.append(i)
    y=x[:]
    x.clear()
    return y
df['content']=df['content'].apply(remove_stopwords)

In [21]:
def join_back(list_input):
    return " ".join(list_input)
    

df['content']=df['content'].apply(join_back)

In [22]:
import nltk

from nltk.stem.porter import PorterStemmer
ps= PorterStemmer()
y=[]

def stem_words(text):
    for i in text:
        y.append(ps.stem(i))
    z=y[:]
    y.clear()
    return z
df['content']=df['content'].apply(stem_words)

In [23]:
def joinback2(list_input):
    return "".join(list_input)
    


df['content']=df['content'].apply(joinback2)

In [24]:
df.head()

Unnamed: 0,content,sentiment
0,never hurt people love lot hurt back probably ...,2
1,expect tell wrong figure ready correct cos kno...,2
2,preferred walking away fighting worth fighting...,2
3,moving forward life hard part leaving behind s...,2
4,never cry anyone life cry deserve tears deserv...,2


In [25]:
X=df['content']

In [26]:
X.shape

(1586,)

In [27]:
y=df.iloc[:,-1].values

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train,y_test = train_test_split(X,y,test_size=0.3) 

In [29]:
X_train.shape, X_test.shape, y_train.shape,y_test.shape

((1110,), (476,), (1110,), (476,))

### 1.Using CountVectorizer

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=1500)

In [31]:
X_train = cv.fit_transform(X_train).toarray()
X_test=cv.transform(X_test).toarray()

In [32]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
clf1=GaussianNB()
clf2=MultinomialNB()
clf3=BernoulliNB()

In [33]:
clf1.fit(X_train,y_train)
clf2.fit(X_train,y_train)
clf3.fit(X_train,y_train)

In [34]:
y_pred1=clf1.predict(X_test)
y_pred2=clf2.predict(X_test)
y_pred3=clf3.predict(X_test)

In [35]:
from sklearn.metrics import accuracy_score

print("Gaussian",accuracy_score(y_test,y_pred1))
print("Multinomial",accuracy_score(y_test,y_pred2))
print("Bernaulli",accuracy_score(y_test,y_pred3))

Gaussian 0.47478991596638653
Multinomial 0.6995798319327731
Bernaulli 0.6302521008403361


In [36]:
test_sad="i am sad & depressed too"
test_happy= "i am very cheerful"
test_angry="he is annoying"

In [37]:
test_sad = np.array([test_sad])
test_sad = cv.transform(test_sad)

test_happy = np.array([test_happy])
test_happy = cv.transform(test_happy)

test_angry = np.array([test_angry])
test_angry = cv.transform(test_angry)

In [38]:
print(clf2.predict(test_sad),clf2.predict(test_happy),clf2.predict(test_angry))

[2] [1] [1]


#### 2 is Sad ,1 is Happy, 0 is angry

In [39]:
X1=df['content']
y=df.iloc[:,-1].values

In [40]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train,y_test = train_test_split(X1,y,test_size=0.3) 

### 2.Usiing TfidfVectorizer

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(min_df= 1, max_features=1500, strip_accents='unicode',analyzer='word',ngram_range=(1,3),stop_words='english')

In [42]:
X_train = tv.fit_transform(X_train).toarray()
X_test = tv.transform(X_test).toarray()

In [43]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
clf1=GaussianNB()
clf2=MultinomialNB()
clf3=BernoulliNB()

In [44]:
clf1.fit(X_train,y_train)
clf2.fit(X_train,y_train)
clf3.fit(X_train,y_train)
y_pred1=clf1.predict(X_test)
y_pred2=clf2.predict(X_test)
y_pred3=clf3.predict(X_test)

In [45]:
print("Gaussian",accuracy_score(y_test,y_pred1))
print("Multinomial",accuracy_score(y_test,y_pred2))
print("Bernaulli",accuracy_score(y_test,y_pred3))

Gaussian 0.5105042016806722
Multinomial 0.6218487394957983
Bernaulli 0.5441176470588235


In [46]:
test="just go to jail"

In [47]:
test = np.array([angry['content'][0]])
test = tv.transform(test)

In [48]:
clf2.predict(test)

array([0], dtype=int64)

### 0 indicates Angry 