In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.metrics import classification_report,confusion_matrix

import Preprocess_gokhanEr as pp

In [2]:
path = "/Users/gokhanersoz/Desktop/Hepsi/NLP/Data/text_to_emotion.csv"

df = pd.read_csv(path)
df.head()

Unnamed: 0,text,emotion
0,i feel cold,ANGER
1,i feel the cold i can say he sends it,ANGER
2,i remember feeling like my blood had run cold ...,ANGER
3,i hate too is stepping outside in the cold and...,ANGER
4,i don't think i am anti social i just don't re...,ANGER


In [3]:
print("DataFrame Shape : {}".format(df.shape))

DataFrame Shape : (30000, 2)


In [4]:
df["emotion"].value_counts()

LOVE        5000
SADNESS     5000
SURPRISE    5000
JOY         5000
FEAR        5000
ANGER       5000
Name: emotion, dtype: int64

## Preprocessing And Cleaning

In [5]:
%%time
df["text"] = df["text"].apply(lambda words : pp.get_lower_convert(words))
df["text"] = df["text"].apply(lambda words : pp.cont_exp(words))

df["text"] = df["text"].apply(lambda words : pp.remove_special_chars(words))
df["text"] = df["text"].apply(lambda words : pp.remove_accented_chars(words))


#df["text"] = df["text"].apply(lambda words : pp.get_make_base(words))
#df["text"] = df["text"].apply(lambda words : " ".join(pp.spelling_correction(words).words))

CPU times: user 19.7 s, sys: 48.8 ms, total: 19.7 s
Wall time: 19.8 s


In [6]:
df.head()

Unnamed: 0,text,emotion
0,i feel cold,ANGER
1,i feel the cold i can say he sends it,ANGER
2,i remember feeling like my blood had run cold ...,ANGER
3,i hate too is stepping outside in the cold and...,ANGER
4,i do not think i am anti social i just do not ...,ANGER


## Load GloVe Vect

In [7]:
glove_vectors = dict()

In [9]:
file = open("/Users/gokhanersoz/Desktop/Hepsi/NLP/glove/glove.6B.100d.txt", encoding = "utf-8")

name = file.readline().split()[0]
vec = np.array(file.readline().split()[1:])

In [10]:
glove_vectors[name] = vec

In [11]:
glove_vectors["the"]

array(['-0.10767', '0.11053', '0.59812', '-0.54361', '0.67396', '0.10663',
       '0.038867', '0.35481', '0.06351', '-0.094189', '0.15786',
       '-0.81665', '0.14172', '0.21939', '0.58505', '-0.52158', '0.22783',
       '-0.16642', '-0.68228', '0.3587', '0.42568', '0.19021', '0.91963',
       '0.57555', '0.46185', '0.42363', '-0.095399', '-0.42749',
       '-0.16567', '-0.056842', '-0.29595', '0.26037', '-0.26606',
       '-0.070404', '-0.27662', '0.15821', '0.69825', '0.43081',
       '0.27952', '-0.45437', '-0.33801', '-0.58184', '0.22364',
       '-0.5778', '-0.26862', '-0.20425', '0.56394', '-0.58524',
       '-0.14365', '-0.64218', '0.0054697', '-0.35248', '0.16162',
       '1.1796', '-0.47674', '-2.7553', '-0.1321', '-0.047729', '1.0655',
       '1.1034', '-0.2208', '0.18669', '0.13177', '0.15117', '0.7131',
       '-0.35215', '0.91348', '0.61783', '0.70992', '0.23955', '-0.14571',
       '-0.37859', '-0.045959', '-0.47368', '0.2385', '0.20536',
       '-0.18996', '0.32507', '-

In [12]:
glove_vectors = dict()

file = open("/Users/gokhanersoz/Desktop/Hepsi/NLP/glove/glove.6B.100d.txt",encoding = "utf-8")

for line in file:
    
    values = line.split()
    
    word = values[0]
    vectors = np.asarray(values[1:])
    
    glove_vectors[word] = vectors
    
file.close()

In [13]:
glove_vectors["the"]

array(['-0.038194', '-0.24487', '0.72812', '-0.39961', '0.083172',
       '0.043953', '-0.39141', '0.3344', '-0.57545', '0.087459',
       '0.28787', '-0.06731', '0.30906', '-0.26384', '-0.13231',
       '-0.20757', '0.33395', '-0.33848', '-0.31743', '-0.48336',
       '0.1464', '-0.37304', '0.34577', '0.052041', '0.44946', '-0.46971',
       '0.02628', '-0.54155', '-0.15518', '-0.14107', '-0.039722',
       '0.28277', '0.14393', '0.23464', '-0.31021', '0.086173', '0.20397',
       '0.52624', '0.17164', '-0.082378', '-0.71787', '-0.41531',
       '0.20335', '-0.12763', '0.41367', '0.55187', '0.57908', '-0.33477',
       '-0.36559', '-0.54857', '-0.062892', '0.26584', '0.30205',
       '0.99775', '-0.80481', '-3.0243', '0.01254', '-0.36942', '2.2167',
       '0.72201', '-0.24978', '0.92136', '0.034514', '0.46745', '1.1079',
       '-0.19358', '-0.074575', '0.23353', '-0.052062', '-0.22044',
       '0.057162', '-0.15806', '-0.30798', '-0.41625', '0.37972',
       '0.15006', '-0.53212', '

## Text To Glove Vectors

In [14]:
x = "hi hello"

In [15]:
glove_vectors.get("hello").shape

(100,)

In [16]:
(glove_vectors.get("hi").astype("float")+glove_vectors.get("hello").astype("float") / 2).reshape(1,-1)

array([[ 0.27784   ,  0.43795   ,  1.27538   , -0.070965  , -0.41259   ,
        -0.743255  ,  0.237912  ,  0.46574   ,  0.48202425,  0.229122  ,
         0.66806   , -0.286536  , -0.04122   ,  0.1640975 ,  0.785125  ,
         0.66868   ,  0.49138985,  0.505145  ,  0.036476  ,  0.834905  ,
         0.089175  ,  0.708155  , -0.48026   ,  0.224255  ,  0.144025  ,
         1.14101   ,  0.280725  , -0.92531   ,  0.73256   ,  0.173997  ,
        -1.115053  ,  0.590494  ,  1.45507   ,  0.88835   ,  0.06493   ,
         0.137475  ,  0.21101   , -0.107585  , -0.25055   , -1.273985  ,
         0.9665505 , -0.323038  , -0.76747   , -0.14977   , -0.1156955 ,
        -0.76703   , -0.458     ,  0.860763  ,  0.264155  , -0.738937  ,
        -1.771745  ,  0.629305  , -0.082215  , -0.67072   , -0.8232    ,
        -0.821605  ,  0.848206  ,  1.765215  ,  0.47827   , -0.339605  ,
        -0.095926  ,  0.501245  , -1.32355   , -0.01518   , -0.20735   ,
         0.447905  ,  0.507855  ,  0.56158   ,  0.7

In [17]:
vec_shape = 100
x = "hi hello"

def get_vec(words,vec_shape):
    
    arr = np.zeros(vec_shape)
    #print(arr.shape)
    texts = str(words).split(" ")
    #print(len(text))
    
    for text in texts:
        
        try:
            
            vec = glove_vectors.get(text).astype(float)
            arr = arr + vec
            
        except:
            
            pass
    
    #print(arr.shape)
    arr = arr.reshape(1,-1)[0]
    #print(arr.shape)
    
    return arr / len(text)

In [18]:
x = "hi hello"

In [19]:
get_vec(x, vec_shape = 100)

array([ 0.082256  ,  0.127222  ,  0.316766  , -0.091644  , -0.092908  ,
       -0.121954  ,  0.0754624 ,  0.12414   ,  0.0969517 ,  0.0372988 ,
        0.207214  , -0.0671504 ,  0.046546  ,  0.029789  ,  0.190504  ,
        0.14783   ,  0.09757794,  0.133598  ,  0.0301972 ,  0.213538  ,
       -0.001696  ,  0.179122  , -0.167442  , -0.006924  ,  0.105844  ,
        0.337012  , -0.009866  , -0.201296  ,  0.237702  ,  0.0558454 ,
       -0.2182612 ,  0.2182888 ,  0.402344  ,  0.247764  ,  0.00429   ,
        0.075066  ,  0.058562  , -0.065986  , -0.00542   , -0.348614  ,
        0.1946202 , -0.0560112 , -0.22095   ,  0.019708  , -0.0269218 ,
       -0.164444  , -0.120212  ,  0.1796132 ,  0.021304  , -0.1571648 ,
       -0.411418  ,  0.192726  ,  0.028864  , -0.168298  , -0.2363    ,
       -0.239594  ,  0.1771624 ,  0.410946  ,  0.083744  , -0.0793    ,
       -0.0292112 ,  0.17159   , -0.38045   , -0.077062  , -0.001018  ,
        0.107604  ,  0.12302   ,  0.149954  ,  0.166804  , -0.02

In [20]:
df["vec_text"] = df["text"].apply(lambda words : get_vec(words,vec_shape=100))
df.head()

Unnamed: 0,text,emotion,vec_text
0,i feel cold,ANGER,"[-0.200256, 0.49751749999999995, 0.550245, -0...."
1,i feel the cold i can say he sends it,ANGER,"[-1.1038044999999999, 2.2412535, 3.30570499999..."
2,i remember feeling like my blood had run cold ...,ANGER,"[0.25489498, 1.2009659999999998, 1.4476218, -1..."
3,i hate too is stepping outside in the cold and...,ANGER,"[-0.5458101142857142, 0.7955664857142857, 1.49..."
4,i do not think i am anti social i just do not ...,ANGER,"[-0.8428452399999999, 2.6651884, 3.37335018, -..."


## ML Model

In [21]:
y = df["emotion"]

In [22]:
X = []

for vec in df["vec_text"].values:
    X.append(vec)
    
X = np.array(X)
X.shape

(30000, 100)

In [23]:
X_train,X_test,y_train,y_test = train_test_split(X, y , random_state = 0, test_size = 0.2, stratify = y)

In [24]:
X_train.shape,X_test.shape

((24000, 100), (6000, 100))

In [25]:
def run_fit(classification, X, y ):
    
    clf = classification.fit(X,y)
    
    return clf

In [26]:
logistic = run_fit(LogisticRegression(solver = "liblinear", multi_class="auto"), X_train, y_train)
svc = run_fit(LinearSVC(), X_train, y_train)



In [27]:
y_pred_svc = svc.predict(X_test)
y_pred_logistic = logistic.predict(X_test)

In [28]:
print(classification_report(y_test,y_pred_logistic))

              precision    recall  f1-score   support

       ANGER       0.71      0.68      0.69      1000
        FEAR       0.73      0.71      0.72      1000
         JOY       0.74      0.76      0.75      1000
        LOVE       0.83      0.83      0.83      1000
     SADNESS       0.76      0.76      0.76      1000
    SURPRISE       0.76      0.80      0.78      1000

    accuracy                           0.76      6000
   macro avg       0.76      0.76      0.76      6000
weighted avg       0.76      0.76      0.76      6000



In [29]:
print(confusion_matrix(y_test,y_pred_logistic))

[[684  83  71  30  78  54]
 [ 76 713  55  36  55  65]
 [ 65  36 756  43  44  56]
 [ 34  34  31 831  27  43]
 [ 67  68  50  26 756  33]
 [ 43  37  57  33  32 798]]


----

In [30]:
print(classification_report(y_test,y_pred_svc))

              precision    recall  f1-score   support

       ANGER       0.77      0.61      0.68      1000
        FEAR       0.61      0.80      0.69      1000
         JOY       0.72      0.77      0.74      1000
        LOVE       0.82      0.82      0.82      1000
     SADNESS       0.80      0.70      0.75      1000
    SURPRISE       0.81      0.77      0.79      1000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000



In [31]:
print(confusion_matrix(y_test,y_pred_svc))

[[607 160  88  36  69  40]
 [ 39 801  52  25  37  46]
 [ 50  62 766  48  31  43]
 [ 24  70  33 825  18  30]
 [ 48 131  60  31 703  27]
 [ 21  90  62  37  21 769]]


## Predict Text Emotion With Custom Data

In [32]:
import pickle

#pickle.dump(logistic, open("logistic_glove.pkl","wb"))
#emotion = pickle.load(open("logistic_glove.pkl","rb"))
#emotion

In [33]:
def get_pred(words):
    
    words = pp.get_lower_convert(words)
    words = pp.cont_exp(words)
    
    words = pp.remove_special_chars(words)
    words = pp.remove_accented_chars(words)
    vec = get_vec(words,vec_shape=100).reshape(1,-1)
    
    return vec

In [34]:
x = "i am so happy. thanks a lot"
get_pred(x).shape

(1, 100)

In [35]:
logistic.predict(get_pred(x))[0]

'JOY'

In [36]:
svc.predict(get_pred(x))[0]

'JOY'