In [1]:
import pandas as pd

In [2]:
data_tweet = pd.read_csv (r"C:\Users\vira_\Downloads\tweet_emotions.csv")

In [3]:
data_tweet["sentiment"] = data_tweet["sentiment"].apply(lambda x: x if x in ['neutral', 'worry', 'happiness', 'sadness', 'love'] 
                                               else 'other')

### Preprocessing

In [4]:
import re
from nltk.tokenize import TweetTokenizer
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
stemmer = WordNetLemmatizer()

In [5]:
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    text = re.sub(r'\W', ' ', str(text))
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    lemma_text = [stemmer.lemmatize(word) for word in tokens]
    stops = set(stopwords.words("english"))
    no_stop_text = [word for word in lemma_text if word not in stops] 
    clean_text = ' '.join(no_stop_text)
    return clean_text

In [6]:
clean_tweets = [preprocess_text(sentence) for sentence in data_tweet["content"]]

### Vectorization

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
tfidf_vect = TfidfVectorizer(stop_words="english")

X = tfidf_vect.fit_transform(clean_tweets) # очищенные тексты

In [9]:
tfidf_vect = TfidfVectorizer(stop_words="english")

X_no_prep = tfidf_vect.fit_transform(data_tweet["content"]) # неочищенные тексты

In [10]:
y = data_tweet["sentiment"]

### Training

In [11]:
from sklearn.model_selection import train_test_split # с препроцессингом

def split_train_test(data_tweet, test_size=0.3, shuffle_state=True):
    X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                        shuffle=shuffle_state,
                                                        test_size=test_size, 
                                                        random_state=15)
    print("Value counts for Train sentiments")
    print(y_train.value_counts())
    print("Value counts for Test sentiments")
    print(y_test.value_counts())
    print(type(X_train))
    print(type(y_train))
    return X_train, X_test, y_train, y_test


X_train, X_test, y_train, y_test = split_train_test(data_tweet)

Value counts for Train sentiments
neutral      6094
other        6006
worry        5995
happiness    3619
sadness      3605
love         2681
Name: sentiment, dtype: int64
Value counts for Test sentiments
other        2681
neutral      2544
worry        2464
happiness    1590
sadness      1560
love         1161
Name: sentiment, dtype: int64
<class 'scipy.sparse.csr.csr_matrix'>
<class 'pandas.core.series.Series'>


In [12]:
from sklearn.model_selection import train_test_split # без препроцессинга

def split_train_test(data_tweet, test_size=0.3, shuffle_state=True):
    X_train1, X_test1, y_train1, y_test1 = train_test_split(X_no_prep, y, 
                                                        shuffle=shuffle_state,
                                                        test_size=test_size, 
                                                        random_state=15)
    print("Value counts for Train sentiments")
    print(y_train1.value_counts())
    print("Value counts for Test sentiments")
    print(y_test1.value_counts())
    print(type(X_train1))
    print(type(y_train1))
    return X_train1, X_test1, y_train1, y_test1


X_train1, X_test1, y_train1, y_test1 = split_train_test(data_tweet)

Value counts for Train sentiments
neutral      6094
other        6006
worry        5995
happiness    3619
sadness      3605
love         2681
Name: sentiment, dtype: int64
Value counts for Test sentiments
other        2681
neutral      2544
worry        2464
happiness    1590
sadness      1560
love         1161
Name: sentiment, dtype: int64
<class 'scipy.sparse.csr.csr_matrix'>
<class 'pandas.core.series.Series'>


### Исправление дисбаланса классов

In [13]:
import imblearn

In [14]:
from imblearn.over_sampling import SMOTE

In [15]:
sm = SMOTE(random_state = 2)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.ravel()) # с препроцессингом

In [16]:
print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))

print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))



print("After OverSampling, counts of label 'neutral': {}".format(sum(y_train_res == 'neutral')))

print("After OverSampling, counts of label 'other': {} \n".format(sum(y_train_res == 'other')))

print("After OverSampling, counts of label 'worry': {}".format(sum(y_train_res == 'worry')))

print("After OverSampling, counts of label 'happiness': {} \n".format(sum(y_train_res == 'happiness')))

print("After OverSampling, counts of label 'sadness': {}".format(sum(y_train_res == 'sadness')))

print("After OverSampling, counts of label 'love': {} \n".format(sum(y_train_res == 'love')))

After OverSampling, the shape of train_X: (36564, 45484)
After OverSampling, the shape of train_y: (36564,) 

After OverSampling, counts of label 'neutral': 6094
After OverSampling, counts of label 'other': 6094 

After OverSampling, counts of label 'worry': 6094
After OverSampling, counts of label 'happiness': 6094 

After OverSampling, counts of label 'sadness': 6094
After OverSampling, counts of label 'love': 6094 



In [17]:
sm = SMOTE(random_state = 2)
X_train_res1, y_train_res1 = sm.fit_resample(X_train1, y_train1.ravel()) # без препроцессинга

### Naive Bayes

In [19]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn import metrics
clf = MultinomialNB(alpha = 2.2).fit(X_train_res, y_train_res) # с препроцессингом
y_predicted = clf.predict(X_test) 
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

   happiness       0.27      0.38      0.32      1590
        love       0.29      0.56      0.38      1161
     neutral       0.38      0.14      0.21      2544
       other       0.30      0.20      0.24      2681
     sadness       0.27      0.35      0.31      1560
       worry       0.34      0.39      0.36      2464

    accuracy                           0.30     12000
   macro avg       0.31      0.34      0.30     12000
weighted avg       0.32      0.30      0.29     12000



In [53]:
m_confusion_test = metrics.confusion_matrix(y_test, y_predicted)


pd.DataFrame(data = m_confusion_test, columns = ['Predicted happiness', 'Predicted love', 'Predicted neutral', 
                                                 'Predicted other', 'Predicted sadness', 'Predicted worry'],
            index = ['Actual happiness', 'Actual love', 'Actual neutral', 
                                                 'Actual other', 'Actual sadness', 'Actual worry'])

Unnamed: 0,Predicted happiness,Predicted love,Predicted neutral,Predicted other,Predicted sadness,Predicted worry
Actual happiness,611,425,93,211,79,171
Actual love,223,647,41,89,72,89
Actual neutral,472,355,360,410,383,564
Actual other,517,422,205,526,419,592
Actual sadness,148,148,78,196,552,438
Actual worry,285,246,160,303,519,951


### Decision Tree

In [25]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [26]:
tree_clf = DecisionTreeClassifier().fit(X_train_res1, y_train_res1) # без препроцессинга
tree_y_pred = tree_clf.predict(X_test1)

In [27]:
print(classification_report(y_test1, tree_y_pred, zero_division=0))

              precision    recall  f1-score   support

   happiness       0.26      0.23      0.24      1590
        love       0.38      0.36      0.37      1161
     neutral       0.33      0.41      0.37      2544
       other       0.28      0.26      0.27      2681
     sadness       0.26      0.24      0.25      1560
       worry       0.28      0.28      0.28      2464

    accuracy                           0.30     12000
   macro avg       0.30      0.29      0.29     12000
weighted avg       0.29      0.30      0.29     12000



In [54]:
m_confusion_test = metrics.confusion_matrix(y_test1, tree_y_pred)


pd.DataFrame(data = m_confusion_test, columns = ['Predicted happiness', 'Predicted love', 'Predicted neutral', 
                                                 'Predicted other', 'Predicted sadness', 'Predicted worry'],
            index = ['Actual happiness', 'Actual love', 'Actual neutral', 
                                                 'Actual other', 'Actual sadness', 'Actual worry'])

Unnamed: 0,Predicted happiness,Predicted love,Predicted neutral,Predicted other,Predicted sadness,Predicted worry
Actual happiness,408,194,308,356,109,215
Actual love,224,427,174,165,69,102
Actual neutral,271,116,1061,454,203,439
Actual other,363,188,678,680,279,493
Actual sadness,116,85,263,264,406,426
Actual worry,210,128,580,434,419,693


In [56]:
tree_clf = DecisionTreeClassifier().fit(X_train_res, y_train_res) # с препроцессингом
tree_y_pred1 = tree_clf.predict(X_test)

In [57]:
print(classification_report(y_test, tree_y_pred1, zero_division=0))

              precision    recall  f1-score   support

   happiness       0.26      0.27      0.27      1590
        love       0.38      0.37      0.37      1161
     neutral       0.34      0.40      0.37      2544
       other       0.28      0.25      0.27      2681
     sadness       0.27      0.26      0.26      1560
       worry       0.29      0.29      0.29      2464

    accuracy                           0.30     12000
   macro avg       0.30      0.30      0.30     12000
weighted avg       0.30      0.30      0.30     12000



In [58]:
m_confusion_test = metrics.confusion_matrix(y_test, tree_y_pred1)


pd.DataFrame(data = m_confusion_test, columns = ['Predicted happiness', 'Predicted love', 'Predicted neutral', 
                                                 'Predicted other', 'Predicted sadness', 'Predicted worry'],
            index = ['Actual happiness', 'Actual love', 'Actual neutral', 
                                                 'Actual other', 'Actual sadness', 'Actual worry'])

Unnamed: 0,Predicted happiness,Predicted love,Predicted neutral,Predicted other,Predicted sadness,Predicted worry
Actual happiness,424,186,296,367,105,212
Actual love,221,427,162,162,76,113
Actual neutral,277,132,1022,445,220,448
Actual other,370,187,676,668,285,495
Actual sadness,120,83,246,253,402,456
Actual worry,195,123,562,456,422,706


### Random Forest

In [30]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
rf = RandomForestClassifier(n_estimators=100, max_depth=20)
rf.fit(X_train_res1, y_train_res1) # без препроцессинга

preds = rf.predict(X_test1)

print(classification_report(y_test1, preds, zero_division=0))

              precision    recall  f1-score   support

   happiness       0.30      0.34      0.32      1590
        love       0.40      0.48      0.44      1161
     neutral       0.32      0.56      0.41      2544
       other       0.31      0.12      0.18      2681
     sadness       0.32      0.33      0.32      1560
       worry       0.35      0.23      0.28      2464

    accuracy                           0.33     12000
   macro avg       0.33      0.35      0.32     12000
weighted avg       0.33      0.33      0.31     12000



In [59]:
m_confusion_test = metrics.confusion_matrix(y_test1, preds)


pd.DataFrame(data = m_confusion_test, columns = ['Predicted happiness', 'Predicted love', 'Predicted neutral', 
                                                 'Predicted other', 'Predicted sadness', 'Predicted worry'],
            index = ['Actual happiness', 'Actual love', 'Actual neutral', 
                                                 'Actual other', 'Actual sadness', 'Actual worry'])

Unnamed: 0,Predicted happiness,Predicted love,Predicted neutral,Predicted other,Predicted sadness,Predicted worry
Actual happiness,537,265,451,144,85,108
Actual love,225,561,214,48,62,51
Actual neutral,269,148,1429,197,188,313
Actual other,449,209,1054,334,318,317
Actual sadness,95,99,443,118,518,287
Actual worry,214,128,846,232,469,575


### Support Vector Machines

In [33]:
from sklearn.svm import SVC

In [35]:
model = SVC()
model.fit(X_train_res, y_train_res) # с препроцессингом

In [36]:
pred_svm = model.predict(X_test)
print(classification_report(pred_svm, y_test, zero_division=0))

              precision    recall  f1-score   support

   happiness       0.21      0.34      0.26      1000
        love       0.37      0.50      0.42       846
     neutral       0.54      0.33      0.41      4199
       other       0.30      0.30      0.30      2656
     sadness       0.18      0.39      0.24       714
       worry       0.37      0.35      0.36      2585

    accuracy                           0.34     12000
   macro avg       0.33      0.37      0.33     12000
weighted avg       0.39      0.34      0.35     12000



In [60]:
m_confusion_test = metrics.confusion_matrix(y_test, pred_svm)


pd.DataFrame(data = m_confusion_test, columns = ['Predicted happiness', 'Predicted love', 'Predicted neutral', 
                                                 'Predicted other', 'Predicted sadness', 'Predicted worry'],
            index = ['Actual happiness', 'Actual love', 'Actual neutral', 
                                                 'Actual other', 'Actual sadness', 'Actual worry'])

Unnamed: 0,Predicted happiness,Predicted love,Predicted neutral,Predicted other,Predicted sadness,Predicted worry
Actual happiness,352,130,497,422,26,163
Actual love,174,407,267,194,32,87
Actual neutral,149,55,1362,475,83,420
Actual other,231,94,944,827,87,498
Actual sadness,47,54,382,297,270,510
Actual worry,87,60,729,459,202,927


In [37]:
model = SVC()
model.fit(X_train_res1, y_train_res1) # без препроцессинга

In [61]:
pred_svm1 = model.predict(X_test1)
print(classification_report(pred_svm1, y_test1, zero_division=0))

              precision    recall  f1-score   support

   happiness       0.22      0.34      0.27      1040
        love       0.35      0.51      0.42       800
     neutral       0.54      0.33      0.41      4181
       other       0.31      0.31      0.31      2674
     sadness       0.17      0.39      0.24       700
       worry       0.38      0.36      0.37      2605

    accuracy                           0.35     12000
   macro avg       0.33      0.37      0.33     12000
weighted avg       0.39      0.35      0.35     12000



In [62]:
m_confusion_test = metrics.confusion_matrix(y_test, pred_svm1)


pd.DataFrame(data = m_confusion_test, columns = ['Predicted happiness', 'Predicted love', 'Predicted neutral', 
                                                 'Predicted other', 'Predicted sadness', 'Predicted worry'],
            index = ['Actual happiness', 'Actual love', 'Actual neutral', 
                                                 'Actual other', 'Actual sadness', 'Actual worry'])

Unnamed: 0,Predicted happiness,Predicted love,Predicted neutral,Predicted other,Predicted sadness,Predicted worry
Actual happiness,352,130,497,422,26,163
Actual love,174,407,267,194,32,87
Actual neutral,149,55,1362,475,83,420
Actual other,231,94,944,827,87,498
Actual sadness,47,54,382,297,270,510
Actual worry,87,60,729,459,202,927


### Gradient Boosting

In [39]:
from sklearn.ensemble import GradientBoostingClassifier

In [40]:
gb_clf = GradientBoostingClassifier(n_iter_no_change=5, verbose=10)

In [41]:
gb_clf.fit(X_train_res, y_train_res) # с препроцессингом

      Iter       Train Loss   Remaining Time 
         1           1.7610           15.93m
         2           1.7384           15.42m
         3           1.7212           14.88m
         4           1.7074           14.44m
         5           1.6958           14.12m
         6           1.6859           13.90m
         7           1.6774           13.68m
         8           1.6694           13.45m
         9           1.6624           13.24m
        10           1.6560           13.04m
        11           1.6504           12.86m
        12           1.6454           12.68m
        13           1.6406           12.52m
        14           1.6361           12.35m
        15           1.6320           12.19m
        16           1.6276           12.02m
        17           1.6234           11.86m
        18           1.6195           11.70m
        19           1.6155           11.55m
        20           1.6121           11.40m
        21           1.6087           11.24m
        2

In [42]:
gb_y_pred = gb_clf.predict(X_test)

In [43]:
print(classification_report(y_test, gb_y_pred))

              precision    recall  f1-score   support

   happiness       0.31      0.35      0.32      1590
        love       0.45      0.43      0.44      1161
     neutral       0.32      0.61      0.42      2544
       other       0.31      0.19      0.23      2681
     sadness       0.35      0.28      0.31      1560
       worry       0.38      0.22      0.28      2464

    accuracy                           0.34     12000
   macro avg       0.35      0.35      0.34     12000
weighted avg       0.35      0.34      0.33     12000



In [63]:
m_confusion_test = metrics.confusion_matrix(y_test, gb_y_pred)


pd.DataFrame(data = m_confusion_test, columns = ['Predicted happiness', 'Predicted love', 'Predicted neutral', 
                                                 'Predicted other', 'Predicted sadness', 'Predicted worry'],
            index = ['Actual happiness', 'Actual love', 'Actual neutral', 
                                                 'Actual other', 'Actual sadness', 'Actual worry'])

Unnamed: 0,Predicted happiness,Predicted love,Predicted neutral,Predicted other,Predicted sadness,Predicted worry
Actual happiness,550,202,472,210,60,96
Actual love,222,514,248,84,59,34
Actual neutral,258,94,1474,306,171,241
Actual other,417,156,1122,483,228,275
Actual sadness,100,63,459,197,464,277
Actual worry,205,86,860,343,407,563


In [44]:
gb_clf.fit(X_train_res1, y_train_res1) # без препроцессинга

      Iter       Train Loss   Remaining Time 
         1           1.7605           15.22m
         2           1.7379           14.74m
         3           1.7206           14.63m
         4           1.7066           14.40m
         5           1.6947           14.16m
         6           1.6848           13.97m
         7           1.6761           13.81m
         8           1.6682           13.64m
         9           1.6616           13.46m
        10           1.6551           13.28m
        11           1.6491           13.11m
        12           1.6437           12.94m
        13           1.6388           12.79m
        14           1.6344           12.63m
        15           1.6301           12.47m
        16           1.6254           12.33m
        17           1.6213           12.18m
        18           1.6171           12.02m
        19           1.6133           11.87m
        20           1.6096           11.71m
        21           1.6060           11.57m
        2

In [64]:
gb_y_pred1 = gb_clf.predict(X_test1)

In [65]:
print(classification_report(y_test, gb_y_pred1))

              precision    recall  f1-score   support

   happiness       0.31      0.35      0.33      1590
        love       0.46      0.44      0.45      1161
     neutral       0.32      0.58      0.41      2544
       other       0.30      0.18      0.22      2681
     sadness       0.33      0.30      0.31      1560
       worry       0.38      0.23      0.29      2464

    accuracy                           0.34     12000
   macro avg       0.35      0.35      0.34     12000
weighted avg       0.34      0.34      0.32     12000



In [66]:
m_confusion_test = metrics.confusion_matrix(y_test, gb_y_pred1)


pd.DataFrame(data = m_confusion_test, columns = ['Predicted happiness', 'Predicted love', 'Predicted neutral', 
                                                 'Predicted other', 'Predicted sadness', 'Predicted worry'],
            index = ['Actual happiness', 'Actual love', 'Actual neutral', 
                                                 'Actual other', 'Actual sadness', 'Actual worry'])

Unnamed: 0,Predicted happiness,Predicted love,Predicted neutral,Predicted other,Predicted sadness,Predicted worry
Actual happiness,550,202,472,210,60,96
Actual love,222,514,248,84,59,34
Actual neutral,258,94,1474,306,171,241
Actual other,417,156,1122,483,228,275
Actual sadness,100,63,459,197,464,277
Actual worry,205,86,860,343,407,563


### K-nearest Neighbors

In [47]:
from sklearn.neighbors import KNeighborsClassifier

In [48]:
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train_res1, y_train_res1) # без препроцессинга
knn_y_pred = knn_clf.predict(X_test1)

In [49]:
print(classification_report(y_test1, knn_y_pred))

              precision    recall  f1-score   support

   happiness       0.21      0.05      0.08      1590
        love       0.39      0.13      0.19      1161
     neutral       0.31      0.28      0.29      2544
       other       0.23      0.18      0.20      2681
     sadness       0.16      0.68      0.26      1560
       worry       0.22      0.02      0.03      2464

    accuracy                           0.21     12000
   macro avg       0.25      0.22      0.18     12000
weighted avg       0.25      0.21      0.18     12000



In [67]:
m_confusion_test = metrics.confusion_matrix(y_test, knn_y_pred)


pd.DataFrame(data = m_confusion_test, columns = ['Predicted happiness', 'Predicted love', 'Predicted neutral', 
                                                 'Predicted other', 'Predicted sadness', 'Predicted worry'],
            index = ['Actual happiness', 'Actual love', 'Actual neutral', 
                                                 'Actual other', 'Actual sadness', 'Actual worry'])

Unnamed: 0,Predicted happiness,Predicted love,Predicted neutral,Predicted other,Predicted sadness,Predicted worry
Actual happiness,444,76,71,40,953,6
Actual love,304,152,49,29,622,5
Actual neutral,415,53,301,115,1623,37
Actual other,463,68,130,104,1900,16
Actual sadness,151,27,50,32,1280,20
Actual worry,290,43,120,67,1905,39


In [68]:
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train_res, y_train_res) # с препроцессингом
knn_y_pred1 = knn_clf.predict(X_test)

In [69]:
print(classification_report(y_test, knn_y_pred1))

              precision    recall  f1-score   support

   happiness       0.21      0.28      0.24      1590
        love       0.36      0.13      0.19      1161
     neutral       0.42      0.12      0.18      2544
       other       0.27      0.04      0.07      2681
     sadness       0.15      0.82      0.26      1560
       worry       0.32      0.02      0.03      2464

    accuracy                           0.19     12000
   macro avg       0.29      0.23      0.16     12000
weighted avg       0.30      0.19      0.15     12000



In [70]:
m_confusion_test = metrics.confusion_matrix(y_test, knn_y_pred1)


pd.DataFrame(data = m_confusion_test, columns = ['Predicted happiness', 'Predicted love', 'Predicted neutral', 
                                                 'Predicted other', 'Predicted sadness', 'Predicted worry'],
            index = ['Actual happiness', 'Actual love', 'Actual neutral', 
                                                 'Actual other', 'Actual sadness', 'Actual worry'])

Unnamed: 0,Predicted happiness,Predicted love,Predicted neutral,Predicted other,Predicted sadness,Predicted worry
Actual happiness,444,76,71,40,953,6
Actual love,304,152,49,29,622,5
Actual neutral,415,53,301,115,1623,37
Actual other,463,68,130,104,1900,16
Actual sadness,151,27,50,32,1280,20
Actual worry,290,43,120,67,1905,39
