In [1]:
import pandas as pd

In [2]:
data_tweet = pd.read_csv (r"C:\Users\vira_\Downloads\tweet_emotions.csv")

In [3]:
data_tweet.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [4]:
data_tweet["sentiment"] = data_tweet["sentiment"].apply(lambda x: x if x in ['neutral', 'worry', 'happiness', 'sadness', 'love'] 
                                               else 'other')

### Preprocessing

In [5]:
import re
from nltk.tokenize import TweetTokenizer
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
stemmer = WordNetLemmatizer()

In [6]:
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    text = re.sub(r'\W', ' ', str(text))
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    lemma_text = [stemmer.lemmatize(word) for word in tokens]
    stops = set(stopwords.words("english"))
    no_stop_text = [word for word in lemma_text if word not in stops] 
    clean_text = ' '.join(no_stop_text)
    return clean_text

In [7]:
data_tweet['content'] = data_tweet['content'].apply(preprocess_text)

In [9]:
# data_tweet.drop(['tweet_id'], axis=1, inplace=True)

In [8]:
tweet_tokenizer = TweetTokenizer()
tweets_tokens = [tweet_tokenizer.tokenize(x) for x in data_tweet['content']]

### Vectorization

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data_tweet['content'])

In [12]:
y = data_tweet["sentiment"]

### Training

In [15]:
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123, stratify=data_tweet.sentiment)

In [13]:
from sklearn.model_selection import train_test_split

def split_train_test(data_tweet, test_size=0.3, shuffle_state=True):
    X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                        shuffle=shuffle_state,
                                                        test_size=test_size, 
                                                        random_state=15)
    print("Value counts for Train sentiments")
    print(y_train.value_counts())
    print("Value counts for Test sentiments")
    print(y_test.value_counts())
    print(type(X_train))
    print(type(y_train))
    return X_train, X_test, y_train, y_test


X_train, X_test, y_train, y_test = split_train_test(data_tweet)

Value counts for Train sentiments
neutral      6094
other        6006
worry        5995
happiness    3619
sadness      3605
love         2681
Name: sentiment, dtype: int64
Value counts for Test sentiments
other        2681
neutral      2544
worry        2464
happiness    1590
sadness      1560
love         1161
Name: sentiment, dtype: int64
<class 'scipy.sparse.csr.csr_matrix'>
<class 'pandas.core.series.Series'>


### Исправление дисбаланса классов

In [14]:
import imblearn

In [15]:
from imblearn.over_sampling import SMOTE

In [16]:
sm = SMOTE(random_state = 2)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.ravel())

In [17]:
print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))

print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))



print("After OverSampling, counts of label 'neutral': {}".format(sum(y_train_res == 'neutral')))

print("After OverSampling, counts of label 'other': {} \n".format(sum(y_train_res == 'other')))

print("After OverSampling, counts of label 'worry': {}".format(sum(y_train_res == 'worry')))

print("After OverSampling, counts of label 'happiness': {} \n".format(sum(y_train_res == 'happiness')))

print("After OverSampling, counts of label 'sadness': {}".format(sum(y_train_res == 'sadness')))

print("After OverSampling, counts of label 'love': {} \n".format(sum(y_train_res == 'love')))

After OverSampling, the shape of train_X: (36564, 45660)
After OverSampling, the shape of train_y: (36564,) 

After OverSampling, counts of label 'neutral': 6094
After OverSampling, counts of label 'other': 6094 

After OverSampling, counts of label 'worry': 6094
After OverSampling, counts of label 'happiness': 6094 

After OverSampling, counts of label 'sadness': 6094
After OverSampling, counts of label 'love': 6094 



### Naive Bayes

In [18]:
from sklearn.naive_bayes import MultinomialNB # на дисбалансе классов
from sklearn.metrics import classification_report
clf = MultinomialNB(alpha = 2.2).fit(X_train, y_train)
y_predicted = clf.predict(X_test)
# print("MultinomialNB Accuracy:", accuracy_score(y_test, y_predicted))
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

   happiness       0.36      0.11      0.17      1590
        love       0.57      0.22      0.31      1161
     neutral       0.36      0.25      0.29      2544
       other       0.28      0.43      0.34      2681
     sadness       0.47      0.05      0.10      1560
       worry       0.30      0.61      0.40      2464

    accuracy                           0.32     12000
   macro avg       0.39      0.28      0.27     12000
weighted avg       0.36      0.32      0.29     12000



In [34]:
from sklearn import metrics

In [35]:
m_confusion_test = metrics.confusion_matrix(y_test, y_predicted)


pd.DataFrame(data = m_confusion_test, columns = ['Predicted happiness', 'Predicted love', 'Predicted neutral', 
                                                 'Predicted other', 'Predicted sadness', 'Predicted worry'],
            index = ['Actual happiness', 'Actual love', 'Actual neutral', 
                                                 'Actual other', 'Actual sadness', 'Actual worry'])

Unnamed: 0,Predicted happiness,Predicted love,Predicted neutral,Predicted other,Predicted sadness,Predicted worry
Actual happiness,387,176,154,538,36,299
Actual love,168,412,77,275,36,193
Actual neutral,244,161,473,664,159,843
Actual other,243,136,283,959,145,915
Actual sadness,47,45,98,347,296,727
Actual worry,112,85,215,497,218,1337


In [63]:
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import OneHotEncoder

In [64]:
y_predicted_train = clf.predict(X_train)
y_valid1 = y_test.values.reshape(-1,1)
y_valid_train1 = y_train.values.reshape(-1,1)
ypred1 = y_predicted.reshape(-1,1)
ypred_train1 = y_predicted_train.reshape(-1,1)
y_valid1 = pd.DataFrame(y_test)
y_valid_train1 = pd.DataFrame(y_train)
ypred1 = pd.DataFrame(y_predicted)
ypred_train1 = pd.DataFrame(y_predicted_train)


onehotencoder = OneHotEncoder()
y_valid1 = onehotencoder.fit_transform(y_valid1).toarray()
y_valid_train1 = onehotencoder.fit_transform(y_valid_train1).toarray()
ypred1 = onehotencoder.fit_transform(ypred1).toarray()
ypred_train1 = onehotencoder.fit_transform(ypred_train1).toarray()


n_classes = ypred1.shape[1]

In [65]:
from sklearn.metrics import log_loss

In [66]:
log_loss(y_valid1, ypred1)

23.41729039574945

In [36]:
from sklearn.naive_bayes import MultinomialNB # сбалансированные классы
from sklearn.metrics import classification_report
clf = MultinomialNB(alpha = 2.2).fit(X_train_res, y_train_res)
y_predicted1 = clf.predict(X_test)
# print("MultinomialNB Accuracy:", accuracy_score(y_test, y_predicted))
print(classification_report(y_test, y_predicted1))

              precision    recall  f1-score   support

   happiness       0.32      0.24      0.28      1590
        love       0.41      0.35      0.38      1161
     neutral       0.36      0.19      0.25      2544
       other       0.29      0.36      0.32      2681
     sadness       0.33      0.19      0.24      1560
       worry       0.31      0.54      0.39      2464

    accuracy                           0.32     12000
   macro avg       0.34      0.31      0.31     12000
weighted avg       0.33      0.32      0.31     12000



In [37]:
m_confusion_test = metrics.confusion_matrix(y_test, y_predicted1)


pd.DataFrame(data = m_confusion_test, columns = ['Predicted happiness', 'Predicted love', 'Predicted neutral', 
                                                 'Predicted other', 'Predicted sadness', 'Predicted worry'],
            index = ['Actual happiness', 'Actual love', 'Actual neutral', 
                                                 'Actual other', 'Actual sadness', 'Actual worry'])

Unnamed: 0,Predicted happiness,Predicted love,Predicted neutral,Predicted other,Predicted sadness,Predicted worry
Actual happiness,387,176,154,538,36,299
Actual love,168,412,77,275,36,193
Actual neutral,244,161,473,664,159,843
Actual other,243,136,283,959,145,915
Actual sadness,47,45,98,347,296,727
Actual worry,112,85,215,497,218,1337


### Decision Tree

In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [21]:
tree_clf = DecisionTreeClassifier().fit(X_train, y_train) # на дисбалансе классов
tree_y_pred = tree_clf.predict(X_test)

In [22]:
print(classification_report(y_test, tree_y_pred, zero_division=0))

              precision    recall  f1-score   support

   happiness       0.26      0.24      0.25      1590
        love       0.36      0.35      0.36      1161
     neutral       0.33      0.42      0.37      2544
       other       0.28      0.25      0.27      2681
     sadness       0.28      0.23      0.25      1560
       worry       0.28      0.29      0.28      2464

    accuracy                           0.30     12000
   macro avg       0.30      0.30      0.30     12000
weighted avg       0.30      0.30      0.30     12000



In [38]:
m_confusion_test = metrics.confusion_matrix(y_test, tree_y_pred)


pd.DataFrame(data = m_confusion_test, columns = ['Predicted happiness', 'Predicted love', 'Predicted neutral', 
                                                 'Predicted other', 'Predicted sadness', 'Predicted worry'],
            index = ['Actual happiness', 'Actual love', 'Actual neutral', 
                                                 'Actual other', 'Actual sadness', 'Actual worry'])

Unnamed: 0,Predicted happiness,Predicted love,Predicted neutral,Predicted other,Predicted sadness,Predicted worry
Actual happiness,445,352,204,240,186,163
Actual love,225,507,107,117,118,87
Actual neutral,489,367,442,275,653,318
Actual other,546,381,372,378,635,369
Actual sadness,167,146,189,170,609,279
Actual worry,350,293,323,292,729,477


In [73]:
tree_y_pred_train = tree_clf.predict(X_train)
y_valid2 = y_test.values.reshape(-1,1)
y_valid_train2 = y_train.values.reshape(-1,1)
ypred2 = tree_y_pred.reshape(-1,1)
ypred_train2 = tree_y_pred_train.reshape(-1,1)
y_valid2 = pd.DataFrame(y_test)
y_valid_train2 = pd.DataFrame(y_train)
ypred2 = pd.DataFrame(tree_y_pred)
ypred_train2 = pd.DataFrame(tree_y_pred_train)


onehotencoder = OneHotEncoder()
y_valid2 = onehotencoder.fit_transform(y_valid2).toarray()
y_valid_train2 = onehotencoder.fit_transform(y_valid_train2).toarray()
ypred2 = onehotencoder.fit_transform(ypred2).toarray()
ypred_train2 = onehotencoder.fit_transform(ypred_train2).toarray()


n_classes = ypred2.shape[1]

In [74]:
log_loss(y_valid2, ypred2)

26.312791150189472

In [39]:
tree_clf = DecisionTreeClassifier().fit(X_train_res, y_train_res) # сбалансированные классы
tree_y_pred1 = tree_clf.predict(X_test)

In [40]:
print(classification_report(y_test, tree_y_pred1, zero_division=0))

              precision    recall  f1-score   support

   happiness       0.21      0.30      0.25      1590
        love       0.25      0.44      0.32      1161
     neutral       0.26      0.17      0.21      2544
       other       0.26      0.14      0.19      2681
     sadness       0.21      0.39      0.27      1560
       worry       0.27      0.18      0.22      2464

    accuracy                           0.24     12000
   macro avg       0.24      0.27      0.24     12000
weighted avg       0.25      0.24      0.23     12000



In [42]:
m_confusion_test = metrics.confusion_matrix(y_test, tree_y_pred1)


pd.DataFrame(data = m_confusion_test, columns = ['Predicted happiness', 'Predicted love', 'Predicted neutral', 
                                                 'Predicted other', 'Predicted sadness', 'Predicted worry'],
            index = ['Actual happiness', 'Actual love', 'Actual neutral', 
                                                 'Actual other', 'Actual sadness', 'Actual worry'])

Unnamed: 0,Predicted happiness,Predicted love,Predicted neutral,Predicted other,Predicted sadness,Predicted worry
Actual happiness,471,355,207,224,171,162
Actual love,215,516,102,120,121,87
Actual neutral,488,373,429,288,658,308
Actual other,542,368,383,388,628,372
Actual sadness,164,147,179,188,606,276
Actual worry,345,296,327,303,739,454


### Random Forest

In [25]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
rf = RandomForestClassifier(n_estimators=100, max_depth=20) # на дисбалансе классов
rf.fit(X_train, y_train)

preds = rf.predict(X_test)

print(classification_report(y_test, preds, zero_division=0))

              precision    recall  f1-score   support

   happiness       0.54      0.00      0.01      1590
        love       0.63      0.06      0.11      1161
     neutral       0.28      0.74      0.41      2544
       other       0.29      0.26      0.27      2681
     sadness       0.00      0.00      0.00      1560
       worry       0.33      0.37      0.35      2464

    accuracy                           0.30     12000
   macro avg       0.35      0.24      0.19     12000
weighted avg       0.32      0.30      0.23     12000



In [43]:
m_confusion_test = metrics.confusion_matrix(y_test, preds)


pd.DataFrame(data = m_confusion_test, columns = ['Predicted happiness', 'Predicted love', 'Predicted neutral', 
                                                 'Predicted other', 'Predicted sadness', 'Predicted worry'],
            index = ['Actual happiness', 'Actual love', 'Actual neutral', 
                                                 'Actual other', 'Actual sadness', 'Actual worry'])

Unnamed: 0,Predicted happiness,Predicted love,Predicted neutral,Predicted other,Predicted sadness,Predicted worry
Actual happiness,476,240,261,242,167,204
Actual love,195,549,110,96,118,93
Actual neutral,257,146,741,275,670,455
Actual other,358,198,518,488,575,544
Actual sadness,80,91,211,160,591,427
Actual worry,159,134,411,292,609,859


In [78]:
preds_train = rf.predict(X_train)
y_valid3 = y_test.values.reshape(-1,1)
y_valid_train3 = y_train.values.reshape(-1,1)
ypred3 = preds.reshape(-1,1)
ypred_train3 = preds_train.reshape(-1,1)
y_valid3 = pd.DataFrame(y_test)
y_valid_train3 = pd.DataFrame(y_train)
ypred3 = pd.DataFrame(preds)
ypred_train3 = pd.DataFrame(preds_train)


onehotencoder = OneHotEncoder()
y_valid3 = onehotencoder.fit_transform(y_valid3).toarray()
y_valid_train3 = onehotencoder.fit_transform(y_valid_train3).toarray()
ypred3 = onehotencoder.fit_transform(ypred3).toarray()
ypred_train3 = onehotencoder.fit_transform(ypred_train3).toarray()


n_classes = ypred3.shape[1]

In [79]:
log_loss(y_valid3, ypred3)

23.877807414348265

In [49]:
rf = RandomForestClassifier(n_estimators=100, max_depth=20) # сбалансированные классы
rf.fit(X_train_res, y_train_res)

preds1 = rf.predict(X_test)

print(classification_report(y_test, preds1, zero_division=0))

              precision    recall  f1-score   support

   happiness       0.29      0.33      0.31      1590
        love       0.41      0.48      0.44      1161
     neutral       0.32      0.25      0.28      2544
       other       0.31      0.17      0.22      2681
     sadness       0.21      0.37      0.27      1560
       worry       0.33      0.35      0.34      2464

    accuracy                           0.30     12000
   macro avg       0.31      0.33      0.31     12000
weighted avg       0.31      0.30      0.30     12000



In [50]:
m_confusion_test = metrics.confusion_matrix(y_test, preds1)


pd.DataFrame(data = m_confusion_test, columns = ['Predicted happiness', 'Predicted love', 'Predicted neutral', 
                                                 'Predicted other', 'Predicted sadness', 'Predicted worry'],
            index = ['Actual happiness', 'Actual love', 'Actual neutral', 
                                                 'Actual other', 'Actual sadness', 'Actual worry'])

Unnamed: 0,Predicted happiness,Predicted love,Predicted neutral,Predicted other,Predicted sadness,Predicted worry
Actual happiness,524,235,244,191,192,204
Actual love,220,552,101,78,116,94
Actual neutral,338,152,646,267,665,476
Actual other,431,196,473,465,579,537
Actual sadness,100,89,176,190,582,423
Actual worry,197,127,362,293,613,872


### Support Vector Machines

In [28]:
from sklearn.svm import SVC

In [29]:
model = SVC()
model.fit(X_train, y_train) # на дисбалансе классов

In [30]:
pred_svm = model.predict(X_test)
print(classification_report(pred_svm, y_test, zero_division=0))

              precision    recall  f1-score   support

   happiness       0.24      0.35      0.28      1074
        love       0.36      0.54      0.43       773
     neutral       0.53      0.36      0.43      3820
       other       0.35      0.32      0.33      2987
     sadness       0.17      0.43      0.24       607
       worry       0.39      0.35      0.37      2739

    accuracy                           0.36     12000
   macro avg       0.34      0.39      0.35     12000
weighted avg       0.40      0.36      0.37     12000



In [46]:
m_confusion_test = metrics.confusion_matrix(y_test, pred_svm)


pd.DataFrame(data = m_confusion_test, columns = ['Predicted happiness', 'Predicted love', 'Predicted neutral', 
                                                 'Predicted other', 'Predicted sadness', 'Predicted worry'],
            index = ['Actual happiness', 'Actual love', 'Actual neutral', 
                                                 'Actual other', 'Actual sadness', 'Actual worry'])

Unnamed: 0,Predicted happiness,Predicted love,Predicted neutral,Predicted other,Predicted sadness,Predicted worry
Actual happiness,429,133,250,523,68,187
Actual love,168,377,133,322,64,97
Actual neutral,372,95,699,534,423,421
Actual other,299,118,465,938,324,537
Actual sadness,68,49,194,337,410,502
Actual worry,158,71,387,515,448,885


In [80]:
pred_svm_train = model.predict(X_train)
y_valid4 = y_test.values.reshape(-1,1)
y_valid_train4 = y_train.values.reshape(-1,1)
ypred4 = pred_svm.reshape(-1,1)
ypred_train4 = pred_svm_train.reshape(-1,1)
y_valid4 = pd.DataFrame(y_test)
y_valid_train4 = pd.DataFrame(y_train)
ypred4 = pd.DataFrame(pred_svm)
ypred_train4 = pd.DataFrame(pred_svm_train)


onehotencoder = OneHotEncoder()
y_valid4 = onehotencoder.fit_transform(y_valid4).toarray()
y_valid_train4 = onehotencoder.fit_transform(y_valid_train4).toarray()
ypred4 = onehotencoder.fit_transform(ypred4).toarray()
ypred_train4 = onehotencoder.fit_transform(ypred_train4).toarray()


n_classes = ypred4.shape[1]

In [81]:
log_loss(y_valid4, ypred4)

23.77994754789602

In [31]:
model.fit(X_train_res, y_train_res) # сбалансированные классы

In [51]:
pred_svm1 = model.predict(X_test)
print(classification_report(pred_svm1, y_test, zero_division=0))

              precision    recall  f1-score   support

   happiness       0.27      0.29      0.28      1494
        love       0.32      0.45      0.38       843
     neutral       0.27      0.33      0.30      2128
       other       0.35      0.30      0.32      3169
     sadness       0.26      0.24      0.25      1737
       worry       0.36      0.34      0.35      2629

    accuracy                           0.31     12000
   macro avg       0.31      0.32      0.31     12000
weighted avg       0.31      0.31      0.31     12000



In [52]:
m_confusion_test = metrics.confusion_matrix(y_test, pred_svm1)


pd.DataFrame(data = m_confusion_test, columns = ['Predicted happiness', 'Predicted love', 'Predicted neutral', 
                                                 'Predicted other', 'Predicted sadness', 'Predicted worry'],
            index = ['Actual happiness', 'Actual love', 'Actual neutral', 
                                                 'Actual other', 'Actual sadness', 'Actual worry'])

Unnamed: 0,Predicted happiness,Predicted love,Predicted neutral,Predicted other,Predicted sadness,Predicted worry
Actual happiness,429,133,250,523,68,187
Actual love,168,377,133,322,64,97
Actual neutral,372,95,699,534,423,421
Actual other,299,118,465,938,324,537
Actual sadness,68,49,194,337,410,502
Actual worry,158,71,387,515,448,885


### Gradient Boosting

In [56]:
from sklearn.ensemble import GradientBoostingClassifier

In [57]:
gb_clf = GradientBoostingClassifier(n_iter_no_change=5, verbose=10)

In [58]:
gb_clf.fit(X_train, y_train) # на дисбалансе классов

      Iter       Train Loss   Remaining Time 
         1           1.7245           13.49m
         2           1.7110           12.99m
         3           1.7008           12.70m
         4           1.6922           12.55m
         5           1.6855           12.41m
         6           1.6795           12.23m
         7           1.6741           12.09m
         8           1.6696           11.95m
         9           1.6653           11.80m
        10           1.6616           11.66m
        11           1.6580           11.55m
        12           1.6547           11.44m
        13           1.6515           11.30m
        14           1.6486           11.17m
        15           1.6459           11.05m
        16           1.6432           10.91m
        17           1.6409           10.78m
        18           1.6387           10.64m
        19           1.6362           10.51m
        20           1.6339           10.38m
        21           1.6318           10.26m
        2

In [59]:
gb_y_pred = gb_clf.predict(X_test)
print(classification_report(y_test, gb_y_pred))

              precision    recall  f1-score   support

   happiness       0.37      0.21      0.27      1590
        love       0.51      0.40      0.44      1161
     neutral       0.31      0.69      0.43      2544
       other       0.32      0.25      0.28      2681
     sadness       0.43      0.18      0.25      1560
       worry       0.37      0.27      0.31      2464

    accuracy                           0.35     12000
   macro avg       0.38      0.33      0.33     12000
weighted avg       0.37      0.35      0.33     12000



In [60]:
m_confusion_test = metrics.confusion_matrix(y_test, gb_y_pred)


pd.DataFrame(data = m_confusion_test, columns = ['Predicted happiness', 'Predicted love', 'Predicted neutral', 
                                                 'Predicted other', 'Predicted sadness', 'Predicted worry'],
            index = ['Actual happiness', 'Actual love', 'Actual neutral', 
                                                 'Actual other', 'Actual sadness', 'Actual worry'])

Unnamed: 0,Predicted happiness,Predicted love,Predicted neutral,Predicted other,Predicted sadness,Predicted worry
Actual happiness,337,147,636,322,22,126
Actual love,133,460,342,143,33,50
Actual neutral,108,64,1766,311,37,258
Actual other,199,112,1283,676,85,326
Actual sadness,42,52,580,257,277,352
Actual worry,89,73,1053,399,195,655


In [82]:
gb_y_pred_train = gb_clf.predict(X_train)
y_valid5 = y_test.values.reshape(-1,1)
y_valid_train5 = y_train.values.reshape(-1,1)
ypred5 = gb_y_pred.reshape(-1,1)
ypred_train5 = gb_y_pred_train.reshape(-1,1)
y_valid5 = pd.DataFrame(y_test)
y_valid_train5 = pd.DataFrame(y_train)
ypred5 = pd.DataFrame(gb_y_pred)
ypred_train5 = pd.DataFrame(gb_y_pred_train)


onehotencoder = OneHotEncoder()
y_valid5 = onehotencoder.fit_transform(y_valid5).toarray()
y_valid_train5 = onehotencoder.fit_transform(y_valid_train5).toarray()
ypred5 = onehotencoder.fit_transform(ypred5).toarray()
ypred_train5 = onehotencoder.fit_transform(ypred_train5).toarray()


n_classes = ypred5.shape[1]

In [83]:
log_loss(y_valid5, ypred5)

22.533673366312986

In [61]:
gb_clf.fit(X_train_res, y_train_res) # сбалансированные классы

      Iter       Train Loss   Remaining Time 
         1           1.7705           14.61m
         2           1.7552           14.44m
         3           1.7432           14.29m
         4           1.7338           14.16m
         5           1.7256           13.88m
         6           1.7191           13.85m
         7           1.7131           13.75m
         8           1.7078           13.58m
         9           1.7032           13.37m
        10           1.6988           13.16m
        11           1.6947           12.95m
        12           1.6911           12.75m
        13           1.6879           12.62m
        14           1.6848           12.52m
        15           1.6819           12.40m
        16           1.6792           12.29m
        17           1.6767           12.15m
        18           1.6744           12.01m
        19           1.6720           11.88m
        20           1.6698           11.72m
        21           1.6674           11.60m
        2

In [53]:
gb_y_pred1 = gb_clf.predict(X_test)

In [54]:
print(classification_report(y_test, gb_y_pred1))

              precision    recall  f1-score   support

   happiness       0.32      0.35      0.33      1590
        love       0.46      0.45      0.45      1161
     neutral       0.33      0.51      0.40      2544
       other       0.34      0.23      0.28      2681
     sadness       0.35      0.29      0.32      1560
       worry       0.36      0.29      0.32      2464

    accuracy                           0.35     12000
   macro avg       0.36      0.35      0.35     12000
weighted avg       0.35      0.35      0.34     12000



In [None]:
m_confusion_test = metrics.confusion_matrix(y_test, gb_y_pred1)


pd.DataFrame(data = m_confusion_test, columns = ['Predicted happiness', 'Predicted love', 'Predicted neutral', 
                                                 'Predicted other', 'Predicted sadness', 'Predicted worry'],
            index = ['Actual happiness', 'Actual love', 'Actual neutral', 
                                                 'Actual other', 'Actual sadness', 'Actual worry'])

### K-nearest Neighbors

In [86]:
from sklearn.neighbors import KNeighborsClassifier

In [87]:
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train) # на дисбалансе классов
knn_y_pred = knn_clf.predict(X_test)

In [88]:
print(classification_report(y_test, knn_y_pred))

              precision    recall  f1-score   support

   happiness       0.23      0.15      0.18      1590
        love       0.42      0.24      0.31      1161
     neutral       0.24      0.59      0.35      2544
       other       0.25      0.14      0.18      2681
     sadness       0.24      0.17      0.20      1560
       worry       0.26      0.15      0.19      2464

    accuracy                           0.25     12000
   macro avg       0.27      0.24      0.23     12000
weighted avg       0.26      0.25      0.23     12000



In [89]:
knn_y_train_pred = knn_clf.predict(X_train)
y_valid6 = y_test.values.reshape(-1,1)
y_valid_train6 = y_train.values.reshape(-1,1)
ypred6 = knn_y_pred.reshape(-1,1)
ypred_train6 = knn_y_train_pred.reshape(-1,1)
y_valid6 = pd.DataFrame(y_test)
y_valid_train6 = pd.DataFrame(y_train)
ypred6 = pd.DataFrame(knn_y_pred)
ypred_train6 = pd.DataFrame(knn_y_train_pred)


onehotencoder = OneHotEncoder()
y_valid6 = onehotencoder.fit_transform(y_valid6).toarray()
y_valid_train6 = onehotencoder.fit_transform(y_valid_train6).toarray()
ypred6 = onehotencoder.fit_transform(ypred6).toarray()
ypred_train6 = onehotencoder.fit_transform(ypred_train6).toarray()


n_classes = ypred6.shape[1]

In [90]:
log_loss(y_valid6, ypred6)

25.765927190603378

In [59]:
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train_res, y_train_res) # сбалансированные классы
knn_y_pred = knn_clf.predict(X_test)

In [60]:
print(classification_report(y_test, knn_y_pred))

              precision    recall  f1-score   support

   happiness       0.16      0.41      0.24      1590
        love       0.17      0.58      0.26      1161
     neutral       0.27      0.03      0.05      2544
       other       0.27      0.01      0.02      2681
     sadness       0.19      0.45      0.27      1560
       worry       0.38      0.01      0.02      2464

    accuracy                           0.18     12000
   macro avg       0.24      0.25      0.14     12000
weighted avg       0.26      0.18      0.11     12000

