In [1]:
import pandas as pd
import numpy as np
# from sklearn.linear_model import LogisticRegression


In [2]:
def sigmoid(z):
    '''
    Logistic (sigmoid) function, evaluated with NumPy so it works element-wise.

    Input:
        z: is the input (can be a scalar or an array)
    Output:
        h: the sigmoid of z, i.e. 1 / (1 + e^(-z))
    '''
    # Build the denominator first, then invert it.
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [3]:
def gradientDescent(x, y, theta, alpha, num_iters):
    '''
    Fit logistic-regression weights with batch gradient descent.

    Input:
        x: matrix of features which is (m,n+1)
        y: corresponding labels of the input matrix x, dimensions (m,1)
        theta: weight vector of dimension (n+1,1)
        alpha: learning rate
        num_iters: number of iterations you want to train your model for
    Output:
        J: the cost as a Python float. It is evaluated with the weights in
           effect at the start of the last iteration (before that iteration's
           update), or with the initial theta when no iteration runs.
        theta: your final weight vector
    Hint: you might want to print the cost to make sure that it is going down.
    '''
    m = len(x)

    def _cost(h):
        # Mean cross-entropy between labels y and predicted probabilities h.
        return (-1/m)*(np.dot(y.T, np.log(h)) + np.dot((1-y).T, np.log(1-h)))

    J = None
    for _ in range(num_iters):
        # Forward pass: probabilities predicted by the current weights.
        h = sigmoid(np.dot(x, theta))

        # Cost of the current weights (computed before they are updated).
        J = _cost(h)

        # Gradient step on the weights.
        theta = theta - (alpha/m)*np.dot(x.T, h-y)

    if J is None:
        # No iteration ran (num_iters <= 0): report the cost of the initial
        # weights instead of failing on an unbound J.
        J = _cost(sigmoid(np.dot(x, theta)))

    # The cost is a (1,1) array for column-vector inputs; squeeze it to 0-d
    # first because float() on an array with ndim > 0 is deprecated in NumPy.
    return float(np.squeeze(J)), theta


#Frequency generating function
# def build_freqs(tweets, ys):
#     yslist = np.squeeze(ys).tolist()
    
#     freqs = {}
#     for y, tweet in zip(yslist, tweets):
#         for word in process_tweet(tweet):
#             pair = (word, y)
#             freqs[pair] = freqs.get(pair, 0) + 1
            
#     return freqs


In [4]:
# Training and validation splits, parsed as semicolon-separated text with the
# two columns named 'text' and 'label'.
df_train = pd.read_csv(
    "Y:/GP/Datasets/Emotion datasets/archive/train.txt",
    sep=';',
    names=['text', 'label'],
)
df_val = pd.read_csv(
    "Y:/GP/Datasets/Emotion datasets/archive/val.txt",
    sep=';',
    names=['text', 'label'],
)

In [5]:
# Stack the training and validation splits into one frame with a fresh
# 0..n-1 index, report its size, and preview five random rows.
df = pd.concat([df_train, df_val], ignore_index=True)
print("Shape of the DataFrame:", df.shape)
df.sample(5)

Shape of the DataFrame: (18000, 2)


Unnamed: 0,text,label
6299,im feeling pretty smart,joy
16868,im sure he remembers what it feels like to hav...,joy
6386,i normally would call meaningless and stupid b...,joy
17789,im at work and hes at school most likely feeli...,sadness
6509,i feel burdened by my own expectations,sadness


In [6]:
def custom_encoder(df):
    '''
    Collapse the six emotion labels into two classes, modifying `df` in place.

    "surprise", "love" and "joy" become 1; "fear", "anger" and "sadness"
    become 0. Nothing is returned. In this notebook the argument is a label
    Series, despite the parameter name.
    '''
    label_codes = {
        "surprise": 1,
        "love": 1,
        "joy": 1,
        "fear": 0,
        "anger": 0,
        "sadness": 0,
    }
    # One in-place replacement per emotion, in the order listed above.
    for emotion, code in label_codes.items():
        df.replace(to_replace=emotion, value=code, inplace=True)

# Encode a detached copy of the label column and assign it back explicitly.
# Mutating the Series returned by df['label'] in place only reaches `df`
# through pandas' internal caching; under Copy-on-Write that update is lost.
encoded_labels = df['label'].copy()
custom_encoder(encoded_labels)
df['label'] = encoded_labels
    

In [7]:
# Spot-check one randomly drawn row (sample() returns a single row by default).
df.sample()

Unnamed: 0,text,label
8958,i didnt respond because i feel that some days ...,0


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer# Create feature vectors
# TF-IDF features: min_df=5 ignores terms found in fewer than 5 documents,
# max_df=0.8 ignores terms found in more than 80% of documents, and
# sublinear_tf applies 1 + log(tf) scaling to term frequencies.
vectorizer = TfidfVectorizer(min_df=5, max_df=0.8, sublinear_tf=True, use_idf=True)

# Learn the vocabulary and IDF weights from the text column and vectorize it.
train_vectors = vectorizer.fit_transform(df['text'])


In [9]:
# Dimensions of the TF-IDF matrix produced by fit_transform.
train_vectors.shape

(18000, 3669)

In [10]:
# Test split, parsed as semicolon-separated text with the two columns named
# 'text' and 'label'.
test_df = pd.read_csv(
    "Y:/GP/Datasets/Emotion datasets/archive/test.txt",
    sep=';',
    names=['text', 'label'],
)

In [11]:
# Text and labels of the test split.
X_test = test_df['text']
y_test = test_df['label']

# Collapse the emotion labels into the two classes 0 and 1 (in place).
custom_encoder(y_test)

# Vectorize the test text with the already-fitted vectorizer (transform only,
# so the vocabulary and IDF weights are not re-learned).
test_vectors = vectorizer.transform(X_test)



In [12]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,roc_curve,classification_report,plot_confusion_matrix,ConfusionMatrixDisplay

In [13]:
# lr=LogisticRegression(max_iter=1500,tol=1e-09)
# lr.fit(train_vectors, df['label'])
# logisticPrediction=lr.predict(test_vectors)

In [14]:
from sklearn import svm
# Linear-kernel support vector classifier: fit on the TF-IDF training features
# and the label column, then predict the test vectors.
# (To time this step, run the cell under the %%time cell magic.)
classifier_linear = svm.SVC(kernel='linear').fit(train_vectors, df['label'])
prediction_linear = classifier_linear.predict(test_vectors)


In [15]:
# Preview the first rows of the test frame.
test_df.head()

Unnamed: 0,text,label
0,im feeling rather rotten so im not very ambiti...,0
1,im updating my blog because i feel shitty,0
2,i never make her separate from me because i do...,0
3,i left with my bouquet of red and yellow tulip...,1
4,i was feeling a little vain when i did this one,0


In [16]:
# Per-class metrics as a nested dict; classes are keyed by their string labels.
report = classification_report(y_test, prediction_linear, output_dict=True)
for caption, class_key in (('positive: ', '1'), ('negative: ', '0')):
    print(caption, report[class_key])

positive:  {'precision': 0.9737704918032787, 'recall': 0.9684782608695652, 'f1-score': 0.9711171662125341, 'support': 920}
negative:  {'precision': 0.9732718894009217, 'recall': 0.9777777777777777, 'f1-score': 0.9755196304849885, 'support': 1080}


In [17]:
# plot_confusion_matrix(y_test,prediction_linear)

In [18]:
# Headline metrics for the linear-SVM predictions.
# (To score another model, point prediction_linear at its predictions first.)
acc_score, pre_score, rec_score = (
    metric(y_test, prediction_linear)
    for metric in (accuracy_score, precision_score, recall_score)
)
for caption, value in (
    ('Accuracy_score: ', acc_score),
    ('Precision_score: ', pre_score),
    ('Recall_score: ', rec_score),
):
    print(caption, value)
print("-" * 50)

# Full per-class text report.
cr = classification_report(y_test, prediction_linear)
print(cr)

Accuracy_score:  0.9735
Precision_score:  0.9737704918032787
Recall_score:  0.9684782608695652
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      1080
           1       0.97      0.97      0.97       920

    accuracy                           0.97      2000
   macro avg       0.97      0.97      0.97      2000
weighted avg       0.97      0.97      0.97      2000



SVM (linear kernel): 97.35% accuracy.

SVM (RBF kernel): 96.85% accuracy.

SVM (sigmoid kernel): 97.05% accuracy.

Logistic Regression: 95.7% accuracy.
