### Emotion Classifier

#### Dataset

The dataset set contained in four text files consists of tweets for four different emotions: anger, fear, joy and sadness.<br>

Along with the tweet, the intensity or degree of emotion X felt by the speaker (a real-valued score between 0 and 1) is also provided. <br>

The maximum possible score 1 stands for feeling the maximum amount of emotion X (or having a mental state maximally inclined towards feeling emotion X). The minimum possible score 0 stands for feeling the least amount of emotion X (or having a mental state maximally away from feeling emotion X). 

#### Goals: 
i) To classify a given tweet into one of the four classes: anger, fear, joy or sadness. <br>
ii) To display the degree of the classified emotion in the tweet.

Installing required package:<br>
```
pip3 install nltk
 (or)
pip install nltk
```

In [1]:
import nltk    

In [2]:
from pandas import DataFrame
import pandas as pd

data = [] # Tweets
data_labels = [] # Emotion label (anger, fear, joy, or sadness)
data_int = [] # Intensityy of each emotion

dataset=pd.read_csv("training_set/anger-ratings-0to1.train.txt",delimiter="\t",names=['id','tweet','emotion','intensity'])

# Display first few examples
pd.set_option('display.max_colwidth', -1)
dataset.head()

Unnamed: 0,id,tweet,emotion,intensity
0,10000,How the fu*k! Who the heck! moved my fridge!... should I knock the landlord door. #angry #mad ##,anger,0.938
1,10001,So my Indian Uber driver just called someone the N word. If I wasn't in a moving vehicle I'd have jumped out #disgusted,anger,0.896
2,10002,@DPD_UK I asked for my parcel to be delivered to a pick up store not my address #fuming #poorcustomerservice,anger,0.896
3,10003,so ef whichever butt wipe pulled the fire alarm in davis bc I was sound asleep #pissed #angry #upset #tired #sad #tired #hangry ######,anger,0.896
4,10004,"Don't join @BTCare they put the phone down on you, talk over you and are rude. Taking money out of my acc willynilly! #fuming",anger,0.896


#### Reading the tweets and their corresponding emotion and intensity

In [124]:
from pandas import DataFrame
import pandas as pd

data = [] # Tweets
data_labels = [] # Emotion label (anger, fear, joy, or sadness)
data_int = [] # Intensityy of each emotion

dataset=pd.read_csv("training_set/anger-ratings-0to1.train.txt",delimiter="\t",names=['id','tweet','emotion','intensity'])
for i in range(len(dataset)):
    data.append(dataset.iat[i,1])
    data_labels.append('anger')
    data_int.append(dataset.iat[i,3])
    
dataset=pd.read_csv("training_set/fear-ratings-0to1.train.txt",delimiter="\t",names=['id','tweet','emotion','intensity'])
for i in range(len(dataset)):
    data.append(dataset.iat[i,1])
    data_labels.append('fear')
    data_int.append(dataset.iat[i,3])

dataset=pd.read_csv("training_set/joy-ratings-0to1.train.txt",delimiter="\t",names=['id','tweet','emotion','intensity'])
for i in range(len(dataset)):
    data.append(dataset.iat[i,1])
    data_labels.append('joy')
    data_int.append(dataset.iat[i,3])

dataset=pd.read_csv("training_set/sadness-ratings-0to1.train.txt",delimiter="\t",names=['id','tweet','emotion','intensity'])
for i in range(len(dataset)):
    data.append(dataset.iat[i,1])
    data_labels.append('sadness')
    data_int.append(dataset.iat[i,3])

#### Shuffling the data

In [125]:
from random import shuffle
dv = []
dl = []
di = []
index_shuf = list(range(len(data)))
shuffle(index_shuf)
for i in index_shuf:
    dv.append(data[i])
    dl.append(data_labels[i])
    di.append(data_int[i])
data = dv
data_labels = dl
data_int = di


#### Feature extraction using CountVectorizer

In [126]:
from sklearn.feature_extraction.text import TfidfVectorizer 

vectorizer = TfidfVectorizer(
    analyzer = 'word',
    lowercase = False,
)


#### An example using CountVectorizer

In [127]:
example = ['this is great','This is too great to be great','THIS IS GREAT!']
print(example)

['this is great', 'This is too great to be great', 'THIS IS GREAT!']


In [128]:
features_eg = vectorizer.fit_transform(
    example
)
features_nd_eg = features_eg.toarray() # for easy usage
print(vectorizer.get_feature_names())
print(features_nd_eg)

[u'GREAT', u'IS', u'THIS', u'This', u'be', u'great', u'is', u'this', u'to', u'too']
[[0.         0.         0.         0.         0.         0.51785612
  0.51785612 0.68091856 0.         0.        ]
 [0.         0.         0.         0.38091445 0.38091445 0.57939052
  0.28969526 0.         0.38091445 0.38091445]
 [0.57735027 0.57735027 0.57735027 0.         0.         0.
  0.         0.         0.         0.        ]]


#### Extracting features from tweets

In [129]:
features = vectorizer.fit_transform(
    data
)
features_nd = features.toarray() # for easy usage

In [130]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test  = train_test_split(
        features_nd, 
        data_labels,
        train_size=0.80, test_size=0.20, 
        random_state=1234)

In [131]:
import numpy as np
print(np.shape(features_nd))
print((vectorizer.transform(["I love to "])))

(3613, 11239)
  (0, 10479)	0.4314976665434786
  (0, 7584)	0.902114052527469


### Linear Classifier

In [132]:
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression()

In [133]:
log_model = log_model.fit(X=X_train, y=y_train)

In [134]:
y_pred = log_model.predict(X_test)
y_pred[12]

'fear'

In [135]:
import numpy as np
np.mean(y_pred==y_test)

0.7800829875518672

### Accuracy

In [93]:
# Printing the predictions for some random test data
import random

j = random.randint(0,len(X_test)-7)
for i in range(j,j+7):
    ind = features_nd.tolist().index(X_test[i].tolist())
    print(y_pred[i],":",data[ind].strip())

('fear', ':', 'Because it was a perfect illusion, but at least now I know what it was.  #ladygaga #iscalming#mysoul')
('sadness', ':', "And I won't even get started with Hillary and her fancy fundraisers! #depressing")
('fear', ':', 'induction day tomorrow for pizza express')
('fear', ':', 'a panic attack AND CALL YOURSELF A REAL FAN makes me so mad like i dont even have the words to explain. this is why some people give no +')
('joy', ':', '@alphavenger all chuck seasons, she had that one bright spot with dan, where she was allowed to be smart and kind //and// fashionable')
('anger', ':', "Don't be bitter")
('anger', ':', '@ChurdAllan fucked my coupon that goal!')


In [94]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.7579529737206085


## Exercise
```
There are two sets each containing 4 files for each emotion provided for training and development. 
Combine these two sets for training and use 5-fold cross-validation 
to find out the Accuracy in all the cases mentioned below.
```

1. Calculate the accuracy using Random Forest Classifier and tune the number of estimators to get the best results. Comment on the same.
2. Now use Logistic Regression and observe the accuracy value. Can the performance be further improved by using L1 and L2 regularizations?
3. Repeat the same using Support Vector Classifier.
4. Estimate the training & testing time for each classifier and comment on the results.
5. Now, the emotion intensity score for each tweet is to be found on top of classification. To do this, fit different regression models on the training set for each emotion and find the emotion intensity score for each of the test set. Also, display mean square error for test set.
6. In all the above cases, create a user-defined function, which takes a tweet (text) as input and displays the predicted emotion.
7. A separate test set is provided. Use one of the classification models implemented earlier to determine the corresponding emotion for each tweet in this set. Use the linear regression models to calculate the emotional intensity.

```In all the above cases, use a feature extractor other than CountVectorizer and observe performance.```

In [136]:


dataset=pd.read_csv("dev_set/anger-ratings-0to1.dev.gold.txt",delimiter="\t",names=['id','tweet','emotion','intensity'])
for i in range(len(dataset)):
    data.append(dataset.iat[i,1])
    data_labels.append('anger')
    data_int.append(dataset.iat[i,3])
    
dataset=pd.read_csv("dev_set/fear-ratings-0to1.dev.gold.txt",delimiter="\t",names=['id','tweet','emotion','intensity'])
for i in range(len(dataset)):
    data.append(dataset.iat[i,1])
    data_labels.append('fear')
    data_int.append(dataset.iat[i,3])

dataset=pd.read_csv("dev_set/joy-ratings-0to1.dev.gold.txt",delimiter="\t",names=['id','tweet','emotion','intensity'])
for i in range(len(dataset)):
    data.append(dataset.iat[i,1])
    data_labels.append('joy')
    data_int.append(dataset.iat[i,3])

dataset=pd.read_csv("dev_set/sadness-ratings-0to1.dev.gold.txt",delimiter="\t",names=['id','tweet','emotion','intensity'])
for i in range(len(dataset)):
    data.append(dataset.iat[i,1])
    data_labels.append('sadness')
    data_int.append(dataset.iat[i,3])

In [137]:
from random import shuffle
dv = []
dl = []
di = []
index_shuf = list(range(len(data)))
shuffle(index_shuf)
for i in index_shuf:
    dv.append(data[i])
    dl.append(data_labels[i])
    di.append(data_int[i])
data = dv
data_labels = dl
data_int = di

from sklearn.feature_extraction.text import TfidfVectorizer 

vectorizer = TfidfVectorizer(
    analyzer = 'word',
    lowercase = False,
)

features = vectorizer.fit_transform(
    data
)
features_nd = features.toarray() # for easy usage

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test  = train_test_split(
        features_nd, 
        data_labels,
        train_size=0.80, test_size=0.20, 
        random_state=1234)



In [138]:
features_anger = []
features_fear = []
features_joy = []
features_sadness = []

data_int_anger = []
data_int_fear = []
data_int_joy = []
data_int_sadness = []

for i in range(0,len(data)):
    if data_labels[i] == 'anger':
        features_anger.append(features_nd[i])
        data_int_anger.append(data_int[i])
    if data_labels[i] == 'fear':
        features_fear.append(features_nd[i])
        data_int_fear.append(data_int[i])
    if data_labels[i] == 'sadness':
        features_sadness.append(features_nd[i])
        data_int_sadness.append(data_int[i])
    if data_labels[i] == 'joy':
        features_joy.append(features_nd[i])
        data_int_joy.append(data_int[i])

sum(features_joy[8])


4.323373885956241

# RandomForestClassifier

In [20]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
import time

start = time.time()
clf = RandomForestClassifier()
scores = cross_val_score(clf, X_train, y_train, cv=5)
clf.fit(X_train,y_train)
end = time.time()
print(scores)
print("time taken:", end-start)

[0.73858268 0.74921136 0.74921136 0.73933649 0.75316456]
('time taken:', 13.826820850372314)


In [21]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

start = time.time()
X_train, X_test, y_train, y_test  = train_test_split(
        features_nd, 
        data_labels,
        train_size=0.80, test_size=0.20, 
        random_state=1234)

clf = RandomForestClassifier()
clf.fit(X_train,y_train)
end = time.time()

print(clf.score(X_test,y_test))
print(end - start)

0.797979797979798
3.09264802933


# Grid Search on Random Forest Classifier

In [48]:
from sklearn.grid_search import GridSearchCV
start = time.time()
param_grid = {'n_estimators' : [10, 20, 50,100,150,200]}
gridsearch_rf = GridSearchCV(clf, param_grid, n_jobs=-1,cv=5)
gridsearch_rf.fit(X_train,y_train)
print(gridsearch_rf.best_params_)
print(gridsearch_rf.best_score_)
end = time.time()
print("Time taken:"end-start)




{'n_estimators': 100}
0.833017676768


# Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

import time

X_train, X_test, y_train, y_test  = train_test_split(
        features_nd, 
        data_labels,
        train_size=0.80, test_size=0.20, 
        random_state=1234)

start = time.time()
lr.fit(X_train,y_train)
scores = cross_val_score(lr, X_train, y_train, cv=5)
end = time.time()
print(end-start)
print(scores)


1.65649986267
[0.72755906 0.75709779 0.74290221 0.75513428 0.78481013]


# L1 Regularization

In [142]:
from sklearn.linear_model import LogisticRegression
lr_l1 = LogisticRegression(penalty='l1')

import time

X_train, X_test, y_train, y_test  = train_test_split(
        features_nd, 
        data_labels,
        train_size=0.80, test_size=0.20, 
        random_state=1234)

start = time.time()
lr_l1.fit(X_train,y_train)
scores = cross_val_score(lr_l1, X_train, y_train, cv=5)
end = time.time()
print(end-start)
print(scores)


3.00942206383
[0.77952756 0.81230284 0.82938389 0.83254344 0.82148499]


# L2 Regularization

In [141]:
from sklearn.linear_model import LogisticRegression
lr_l2 = LogisticRegression(penalty='l2')

import time

X_train, X_test, y_train, y_test  = train_test_split(
        features_nd, 
        data_labels,
        train_size=0.80, test_size=0.20, 
        random_state=1234)

start = time.time()
lr_l2.fit(X_train,y_train)
scores = cross_val_score(lr_l2, X_train, y_train, cv=5)
end = time.time()
print(end-start)
print(scores)


1.42507004738
[0.70866142 0.76971609 0.77093207 0.7535545  0.73459716]


It can be seen that using L1 regularization (equivalent to Lasso) gives slightly better performance due to the high number of training features in the dataset

# Support Vector Classification

In [50]:
from sklearn.svm import SVC
svc = SVC()
start = time.time()
scores = cross_val_score(svc, X_train, y_train, cv=5)
svc.fit(X_train,y_train)
end = time.time()
print(end-start)
print(scores)

756.288990974
[0.32598425 0.32492114 0.32492114 0.32543444 0.32594937]


It is seen that the accuracy of SVC is the lowest while the training time for SVC is the highest. It can be seen that the L1 regularization increases the accuracy of the Logistic Regression while increasing the n_estimators increases the accuracy of the Random Forest classifier. The time taken by Logist

# 4) Regression models.


# Lasso Regression

Note: Linear Regression gave very high MSE.

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
import time


lasso_anger = Lasso()
lasso_fear = Lasso()
lasso_sadness = Lasso()
lasso_joy = Lasso()

start = time.time()

X_train, X_test, y_train, y_test  = train_test_split(
        features_anger, 
        data_int_anger,
        train_size=0.80, test_size=0.20, 
        random_state=1234)
lasso_anger.fit(X_train,y_train)
y_pred = lasso_anger.predict(X_test)
print("Lasso anger MSE",mean_squared_error(y_test,y_pred))

X_train, X_test, y_train, y_test  = train_test_split(
        features_fear, 
        data_int_fear,
        train_size=0.80, test_size=0.20, 
        random_state=1234)
lasso_fear.fit(X_train,y_train)
y_pred = lasso_fear.predict(X_test)
print("Lasso fear MSE",mean_squared_error(y_test,y_pred))

X_train, X_test, y_train, y_test  = train_test_split(
        features_sadness, 
        data_int_sadness,
        train_size=0.80, test_size=0.20, 
        random_state=1234)
lasso_sadness.fit(X_train,y_train)
y_pred = lasso_sadness.predict(X_test)
print("Lasso sadness MSE",mean_squared_error(y_test,y_pred))

X_train, X_test, y_train, y_test  = train_test_split(
        features_joy, 
        data_int_joy,
        train_size=0.80, test_size=0.20, 
        random_state=1234)
lasso_joy.fit(X_train,y_train)
y_pred = lasso_joy.predict(X_test)
print("Lasso joy MSE",mean_squared_error(y_test,y_pred))


end = time.time()
print(end-start)

('Lasso anger MSE', 0.02866396602717718)
('Lasso fear MSE', 0.04180730119399449)
('Lasso sadness MSE', 0.03753981642948891)
('Lasso joy MSE', 0.0422497288103531)
1.46816301346


# RandomForestRegressor

In [32]:
from sklearn.ensemble import RandomForestRegressor

rfr_anger = RandomForestRegressor(n_estimators = 5)
rfr_fear = RandomForestRegressor()
rfr_sadness = RandomForestRegressor()
rfr_joy = RandomForestRegressor()

start = time.time()

X_train, X_test, y_train, y_test  = train_test_split(
        features_anger, 
        data_int_anger,
        train_size=0.80, test_size=0.20, 
        random_state=1234)
rfr_anger.fit(X_train,y_train)
y_pred = rfr_anger.predict(X_test)
print("RFR anger MSE",mean_squared_error(y_test,y_pred))
y_pred_train = rfr_anger.predict(X_train)
print("RFR anger train MSE",mean_squared_error(y_train,y_pred_train))

X_train, X_test, y_train, y_test  = train_test_split(
        features_fear, 
        data_int_fear,
        train_size=0.80, test_size=0.20, 
        random_state=1234)
rfr_fear.fit(X_train,y_train)
y_pred = rfr_fear.predict(X_test)
print("RFR fear MSE",mean_squared_error(y_test,y_pred))
y_pred_train = rfr_fear.predict(X_train)
print("RFR fear train MSE",mean_squared_error(y_train,y_pred_train))

X_train, X_test, y_train, y_test  = train_test_split(
        features_sadness, 
        data_int_sadness,
        train_size=0.80, test_size=0.20, 
        random_state=1234)
rfr_sadness.fit(X_train,y_train)
y_pred = rfr_sadness.predict(X_test)
print("RFR sadness MSE",mean_squared_error(y_test,y_pred))
y_pred_train = rfr_sadness.predict(X_train)
print("RFR sadness train MSE",mean_squared_error(y_train,y_pred_train))

X_train, X_test, y_train, y_test  = train_test_split(
        features_joy, 
        data_int_joy,
        train_size=0.80, test_size=0.20, 
        random_state=1234)
rfr_joy.fit(X_train,y_train)
y_pred = rfr_joy.predict(X_test)
print("RFR joy MSE",mean_squared_error(y_test,y_pred))
y_pred_train = rfr_joy.predict(X_train)
print("RFR joy train MSE",mean_squared_error(y_train,y_pred_train))

end = time.time()
print(end-start)

('RFR anger MSE', 0.020250077671957675)
('RFR anger train MSE', 0.005179754574468085)
('RFR fear MSE', 0.02522518808173501)
('RFR fear train MSE', 0.0044942089547401875)
('RFR sadness MSE', 0.026548505813953488)
('RFR sadness train MSE', 0.004292203488372093)
('RFR joy MSE', 0.03518506908313167)
('RFR joy train MSE', 0.0057021593241565)
22.8726279736


# Support Vector Regression

In [33]:
from sklearn.svm import SVR

svr_anger = SVR()
svr_fear = SVR()
svr_sadness = SVR()
svr_joy = SVR()

start = time.time()

X_train, X_test, y_train, y_test  = train_test_split(
        features_anger, 
        data_int_anger,
        train_size=0.80, test_size=0.20, 
        random_state=1234)
svr_anger.fit(X_train,y_train)
y_pred = svr_anger.predict(X_test)
print("SVR anger MSE",mean_squared_error(y_test,y_pred))
y_pred_train = svr_anger.predict(X_train)
print("SVR anger train MSE",mean_squared_error(y_train,y_pred_train))

X_train, X_test, y_train, y_test  = train_test_split(
        features_fear, 
        data_int_fear,
        train_size=0.80, test_size=0.20, 
        random_state=1234)
svr_fear.fit(X_train,y_train)
y_pred = svr_fear.predict(X_test)
print("SVR fear MSE",mean_squared_error(y_test,y_pred))
y_pred_train = svr_fear.predict(X_train)
print("SVR fear train MSE",mean_squared_error(y_train,y_pred_train))

X_train, X_test, y_train, y_test  = train_test_split(
        features_sadness, 
        data_int_sadness,
        train_size=0.80, test_size=0.20, 
        random_state=1234)
svr_sadness.fit(X_train,y_train)
y_pred = svr_sadness.predict(X_test)
print("SVR sadness MSE",mean_squared_error(y_test,y_pred))
y_pred_train = svr_sadness.predict(X_train)
print("SVR sadness train MSE",mean_squared_error(y_train,y_pred_train))

X_train, X_test, y_train, y_test  = train_test_split(
        features_joy, 
        data_int_joy,
        train_size=0.80, test_size=0.20, 
        random_state=1234)
svr_joy.fit(X_train,y_train)
y_pred = svr_joy.predict(X_test)
print("SVR joy MSE",mean_squared_error(y_test,y_pred))
y_pred_train = svr_joy.predict(X_train)
print("SVR joy train MSE",mean_squared_error(y_train,y_pred_train))

end = time.time()
print(end-start)

('SVR anger MSE', 0.02862764165920558)
('SVR anger train MSE', 0.028026568051555557)
('SVR fear MSE', 0.04175611458489101)
('SVR fear train MSE', 0.03648529068366911)
('SVR sadness MSE', 0.037908530474595865)
('SVR sadness train MSE', 0.03559074540435568)
('SVR joy MSE', 0.042360697389192965)
('SVR joy train MSE', 0.04218901209859038)
50.182587862


# MLP

In [89]:
from sklearn.neural_network import MLPRegressor

mlp_anger = MLPRegressor(hidden_layer_sizes=(15,2 ))
mlp_fear = MLPRegressor(hidden_layer_sizes=(15,2 ))
mlp_sadness = MLPRegressor(hidden_layer_sizes=(15,2 ))
mlp_joy = MLPRegressor(hidden_layer_sizes=(15,2 ))

start = time.time()

X_train, X_test, y_train, y_test  = train_test_split(
        features_anger, 
        data_int_anger,
        train_size=0.80, test_size=0.20, 
        random_state=1234)
mlp_anger.fit(X_train,y_train)
y_pred = mlp_anger.predict(X_test)
print("SVR anger MSE",mean_squared_error(y_test,y_pred))
y_pred_train = mlp_anger.predict(X_train)
print("SVR anger train MSE",mean_squared_error(y_train,y_pred_train))

X_train, X_test, y_train, y_test  = train_test_split(
        features_fear, 
        data_int_fear,
        train_size=0.80, test_size=0.20, 
        random_state=1234)
mlp_fear.fit(X_train,y_train)
y_pred = mlp_fear.predict(X_test)
print("MLP fear MSE",mean_squared_error(y_test,y_pred))
y_pred_train = mlp_fear.predict(X_train)
print("MLP fear train MSE",mean_squared_error(y_train,y_pred_train))

X_train, X_test, y_train, y_test  = train_test_split(
        features_sadness, 
        data_int_sadness,
        train_size=0.80, test_size=0.20, 
        random_state=1234)
mlp_sadness.fit(X_train,y_train)
y_pred = mlp_sadness.predict(X_test)
print("MLP sadness MSE",mean_squared_error(y_test,y_pred))
y_pred_train = mlp_sadness.predict(X_train)
print("MLP sadness train MSE",mean_squared_error(y_train,y_pred_train))

X_train, X_test, y_train, y_test  = train_test_split(
        features_joy, 
        data_int_joy,
        train_size=0.80, test_size=0.20, 
        random_state=1234)
mlp_joy.fit(X_train,y_train)
y_pred = mlp_joy.predict(X_test)
print("MLP joy MSE",mean_squared_error(y_test,y_pred))
y_pred_train = mlp_joy.predict(X_train)
print("MLP joy train MSE",mean_squared_error(y_train,y_pred_train))

end = time.time()
print(end-start)



('SVR anger MSE', 0.0946643222897825)
('SVR anger train MSE', 0.0936490618578413)
('MLP fear MSE', 0.06697699408464043)
('MLP fear train MSE', 0.008870261176981378)
('MLP sadness MSE', 0.2314787859402415)
('MLP sadness train MSE', 0.22167894835795152)
('MLP joy MSE', 0.0460406390297131)
('MLP joy train MSE', 0.04623777216197536)
39.8997709751


# Ridge Regression

In [61]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error


ridge = Ridge()
start = time.time()
ridge.fit(X_train,y_train)

y_pred_train = ridge.predict(X_train)
y_pred = ridge.predict(X_test)
print("Training error is:", mean_squared_error(y_train,y_pred_train))
print("Testing error is:", mean_squared_error(y_test,y_pred))

end = time.time()

('Training error is:', 0.002253079001659997)
('Testing error is:', 0.030640771163033063)


# User Defined Function

In [74]:
def tg(str):
    b = vectorizer.transform(str).toarray()
    a = lr.predict(b)
    intensity = []
    for i in range(0,len(a)):
        if a[i] == 'sadness':
            intensity.append(rfr_sadness.predict([b[i]])[0])
        if a[i] == 'joy':
            intensity.append(rfr_joy.predict([b[i]])[0])
        if a[i] == 'fear':
            intensity.append(rfr_fear.predict([b[i]])[0])
        if a[i] == 'anger':
            intensity.append(rfr_anger.predict([b[i]])[0])
    
    features_anger = []
    features_joy = []
    features_sadness = []
    features_fear = []
    return a,intensity

In [76]:
a = ["I am sad ", "I love you", "He missed so many goals", "He killed her"]
x = tg(a)
x

(array(['sadness', 'joy', 'fear', 'anger'], dtype='|S7'),
 [0.6727000000000001, 0.6751, 0.4449000000000001, 0.48739999999999994])

# Test Set Predictions

In [143]:
data = []
data_labels = []
data_int = []

dataset=pd.read_csv("testing_set/anger-ratings-0to1.test.gold.txt",delimiter="\t",names=['id','tweet','emotion','intensity'])
for i in range(len(dataset)):
    data.append(dataset.iat[i,1])
    data_labels.append('anger')
    data_int.append(dataset.iat[i,3])
    
dataset=pd.read_csv("testing_set/fear-ratings-0to1.test.gold.txt",delimiter="\t",names=['id','tweet','emotion','intensity'])
for i in range(len(dataset)):
    data.append(dataset.iat[i,1])
    data_labels.append('fear')
    data_int.append(dataset.iat[i,3])

dataset=pd.read_csv("testing_set/joy-ratings-0to1.test.gold.txt",delimiter="\t",names=['id','tweet','emotion','intensity'])
for i in range(len(dataset)):
    data.append(dataset.iat[i,1])
    data_labels.append('joy')
    data_int.append(dataset.iat[i,3])

dataset=pd.read_csv("testing_set/sadness-ratings-0to1.test.gold.txt",delimiter="\t",names=['id','tweet','emotion','intensity'])
for i in range(len(dataset)):
    data.append(dataset.iat[i,1])
    data_labels.append('sadness')
    data_int.append(dataset.iat[i,3])
    
from random import shuffle
dv = []
dl = []
di = []
index_shuf = list(range(len(data)))
shuffle(index_shuf)
for i in index_shuf:
    dv.append(data[i])
    dl.append(data_labels[i])
    di.append(data_int[i])
data = dv
data_labels = dl
data_int = di


features = vectorizer.transform(
    data
)
test = features.toarray() # for easy usage
    

In [144]:
a = lr_l1.predict(test)
intensity = []
for i in range(0,len(a)):
    if a[i] == 'sadness':
        intensity.append(rfr_sadness.predict([test[i]])[0])
    if a[i] == 'joy':
        intensity.append(rfr_joy.predict([test[i]])[0])
    if a[i] == 'fear':
        intensity.append(rfr_fear.predict([test[i]])[0])
    if a[i] == 'anger':
        intensity.append(rfr_anger.predict([test[i]])[0])
        
        
print("Mean Squared error on test set = ",mean_squared_error(intensity,data_int))
print("Classification accuracy on test set = ",lr.score(test,data_labels))

('Mean Squared error on test set = ', 0.03277340866078876)
('Classification accuracy on test set = ', 0.7527052832590706)


In [7]:
dataset.iloc[4,1]

"Don't join @BTCare they put the phone down on you, talk over you and are rude. Taking money out of my acc willynilly! #fuming"