In [1]:
# Importing the dataset

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
# Importing the dataset

In [4]:
dataset = pd.read_csv('sentiment_data.csv', quoting = 3)

In [5]:
# Cleaning the texts by Stemming

In [7]:
import re 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, 16000):
    text = re.sub('[^a-zA-Z]', ' ', dataset['Text'][i]) 
    text = text.lower()  
    text = text.split()  
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    text = [ps.stem(word) for word in text if not word in set(all_stopwords)] 
    text = ' '.join(text)   
    corpus.append(text)    

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/trishit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
print(corpus)

['didnt feel humili', 'go feel hopeless damn hope around someon care awak', 'im grab minut post feel greedi wrong', 'ever feel nostalg fireplac know still properti', 'feel grouchi', 'ive feel littl burden late wasnt sure', 'ive take milligram time recommend amount ive fallen asleep lot faster also feel like funni', 'feel confus life teenag jade year old man', 'petrona year feel petrona perform well made huge profit', 'feel romant', 'feel like make suffer see mean someth', 'feel run divin experi expect type spiritu encount', 'think easiest time year feel dissatisfi', 'feel low energi thirsti', 'immens sympathi gener point possibl proto writer tri find time write corner life sign agent let alon publish contract feel littl preciou', 'not feel reassur anxieti side', 'didnt realli feel embarrass', 'feel pretti pathet time', 'start feel sentiment doll child began collect vintag barbi doll sixti', 'feel compromis skeptic valu everi unit work put', 'feel irrit reject without anyon anyth say an

In [9]:
# Creating the Bag of Words model

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 2500)
X = cv.fit_transform(corpus).toarray() 
y = dataset.iloc[:, -1].values

In [11]:
len(X[0])

2500

In [15]:
X.shape

(16000, 2500)

In [16]:
y.shape

(16000,)

In [12]:
# Label Encoding the categorical data

In [13]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [14]:
print(y)

[4 4 0 ... 2 0 4]


In [35]:
le.classes_

array(['anger', 'fear', 'joy', 'love', 'sadness', 'surprise'],
      dtype=object)

In [17]:
# Splitting the dataset into training and test set

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [None]:
# Training the Random Forest Classification model on the training set

In [19]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 20, criterion = 'entropy')
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [20]:
# Predicting the test set results

In [21]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1),y_test.reshape(len(y_test),1)),1))

[[0 4]
 [2 2]
 [4 4]
 ...
 [4 4]
 [3 3]
 [1 1]]


In [22]:
# Making the Confusion matrix

In [23]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[391   9  18   9  30   2]
 [ 35 327   7   5   5  18]
 [ 37  17 872  60  50  17]
 [  6   3  61 198   3   0]
 [ 45  30  46  20 758   6]
 [  0  11   1   3   2  98]]


In [24]:
# Evaluating the accuracy of the model

In [25]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.82625

In [30]:
# Saving the trained model

In [31]:
import pickle
pickle_out = open("trained_model.pkl", "wb")
pickle.dump(classifier, pickle_out)
pickle_out.close()

In [32]:
pickle_in = open('trained_model.pkl', 'rb')
classifier = pickle.load(pickle_in)

In [39]:
# Defining the function to make predictions on a unknown text

In [40]:
sentiment_classes = ['angry','fear','happy','love','sad','surprise']
def predictSentiment(new_text):
    new_text = re.sub('[^a-zA-Z]', ' ', new_text) # replacing everything which is not a letter by a space
    new_text = new_text.lower()  # Converting the text to lower-case
    new_text = new_text.split()  # Splitting the review text into words
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    new_text = [ps.stem(word) for word in new_text if not word in set(all_stopwords)] # performing stemming for each and every words in the review except the stopwords
    new_text = ' '.join(new_text)
    new_corpus = [new_text]
    new_X_test = cv.transform(new_corpus).toarray()
    new_y_pred = classifier.predict(new_X_test)
    new_y_pred = int(new_y_pred)
    print("Predicted sentiment: ", sentiment_classes[new_y_pred])

In [41]:
# Making predictions of the sentiments on unknown text

In [42]:
text = 'I feel so lonely'
predictSentiment(text)

Predicted sentiment:  sad


In [43]:
text = 'I am feeling excited'
predictSentiment(text)

Predicted sentiment:  happy


In [47]:
text = 'I am very nervous'
predictSentiment(text)

Predicted sentiment:  fear
