In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score, confusion_matrix
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

In [5]:
df = pd.read_csv("twitter_tweets.csv", encoding='utf8')
corpus = df['Tweet text'].apply(lambda x : str(x).lower()).tolist()
y = df['Category']

In [6]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(corpus)

#зберегти ветори слів
pickle.dump(count_vect.vocabulary_, open("feature.pkl","wb"))

#перетворення векторів слів до TF IDF
tfidf_transformer = TfidfTransformer()
X = tfidf_transformer.fit_transform(X_train_counts)

#зберегти TF-IDF
pickle.dump(tfidf_transformer, open("tfidf.pkl","wb"))

#розділити дані на зразки для навчання та тестування
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [7]:
def evaluate_classifier(classifier, X_test, y_test):
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    kappa = cohen_kappa_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-Score:", f1)
    print("Cohen's Kappa:", kappa)
    print("Confusion Matrix:")
    print(cm)

In [8]:
# Навчання та класифікація з використанням Multinomial Naive Bayes
nb_classifier = MultinomialNB()
params_nb = {'alpha': [0.1, 0.5, 1.0]}
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
grid_search_nb = GridSearchCV(nb_classifier, params_nb, cv=skf, scoring='accuracy')
grid_search_nb.fit(X_train, y_train)
best_params_nb = grid_search_nb.best_params_
print("Multinomial Naive Bayes:")
print("Best parameters:", best_params_nb)
evaluate_classifier(grid_search_nb, X_test, y_test)

Multinomial Naive Bayes:
Best parameters: {'alpha': 1.0}
Accuracy: 0.7689075630252101
Precision: 0.8341707578719617
Recall: 0.7689075630252101
F1-Score: 0.7638538836406742
Cohen's Kappa: 0.7422835176626356
Confusion Matrix:
[[36  3  2  1  9  2  0  5  2  0]
 [ 0 35  0  0  2  0  0  7  0  0]
 [ 0  0 43  0  1  0  0  3  1  1]
 [ 0  2  0 31  5  0  0 11  2  1]
 [ 0  1  0  0 51  0  0  2  1  0]
 [ 0  1  0  0  0 49  0  3  2  1]
 [ 0  6  3  0  1  0  4 10  0  2]
 [ 0  0  0  0  1  1  0 41  0  0]
 [ 0  2  1  1  0  0  0  2 36  0]
 [ 0  1  0  0  1  0  0  6  1 40]]


In [9]:
# Навчання та класифікація з використанням Support Vector Machines
svm_classifier = SVC()
params_svm = {'C': [0.1, 1, 10], 'gamma': [0.1, 0.01, 0.001], 'kernel': ['rbf', 'linear']}
grid_search_svm = GridSearchCV(svm_classifier, params_svm, cv=skf, scoring='accuracy')
grid_search_svm.fit(X_train, y_train)
best_params_svm = grid_search_svm.best_params_
print("\nSupport Vector Machines:")
print("Best parameters:", best_params_svm)
evaluate_classifier(grid_search_svm, X_test, y_test)


Support Vector Machines:
Best parameters: {'C': 10, 'gamma': 0.1, 'kernel': 'linear'}
Accuracy: 0.9327731092436975
Precision: 0.9361177949899383
Recall: 0.9327731092436975
F1-Score: 0.933138313922473
Cohen's Kappa: 0.9250330982414867
Confusion Matrix:
[[53  2  0  0  3  0  0  2  0  0]
 [ 0 44  0  0  0  0  0  0  0  0]
 [ 2  0 46  0  0  0  0  1  0  0]
 [ 0  1  0 47  2  0  0  2  0  0]
 [ 1  1  3  0 48  0  0  2  0  0]
 [ 1  0  0  0  0 52  0  2  1  0]
 [ 0  0  0  0  1  0 25  0  0  0]
 [ 0  0  1  0  0  0  0 42  0  0]
 [ 0  1  0  2  1  0  0  0 38  0]
 [ 0  0  0  0  0  0  0  0  0 49]]


In [7]:
# Навчання та класифікація з використанням Random Forests
rf_classifier = RandomForestClassifier()
params_rf = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10]}
grid_search_rf = GridSearchCV(rf_classifier, params_rf, cv=skf, scoring='accuracy')
grid_search_rf.fit(X_train, y_train)
best_params_rf = grid_search_rf.best_params_
print("\nRandom Forests:")
print("Best parameters:", best_params_rf)
evaluate_classifier(grid_search_rf, X_test, y_test)


Random Forests:
Best parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}
Accuracy: 0.9411764705882353
Precision: 0.9442922015150668
Recall: 0.9411764705882353
F1-Score: 0.9414318240426863
Cohen's Kappa: 0.9343823234014061
Confusion Matrix:
[[53  0  0  0  5  0  0  0  1  1]
 [ 0 42  0  0  1  0  0  0  0  1]
 [ 0  0 45  0  1  0  0  0  2  1]
 [ 1  1  0 46  2  0  0  0  1  1]
 [ 0  1  0  0 53  0  0  0  1  0]
 [ 0  0  0  0  0 56  0  0  0  0]
 [ 0  0  0  0  0  0 26  0  0  0]
 [ 0  0  0  0  0  2  0 41  0  0]
 [ 0  0  0  1  1  0  0  1 39  0]
 [ 0  1  1  0  0  0  0  0  0 47]]


In [8]:
# Навчання та класифікація з використанням Decision Trees
dt_classifier = DecisionTreeClassifier()
params_dt = {'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10]}
grid_search_dt = GridSearchCV(dt_classifier, params_dt, cv=skf, scoring='accuracy')
grid_search_dt.fit(X_train, y_train)
best_params_dt = grid_search_dt.best_params_
print("\nDecision Trees:")
print("Best parameters:", best_params_dt)
evaluate_classifier(grid_search_dt, X_test, y_test)


Decision Trees:
Best parameters: {'max_depth': None, 'min_samples_split': 10}
Accuracy: 0.9222689075630253
Precision: 0.9254266384776296
Recall: 0.9222689075630253
F1-Score: 0.9224280415046797
Cohen's Kappa: 0.9132482168893092
Confusion Matrix:
[[54  1  1  0  4  0  0  0  0  0]
 [ 0 42  0  0  2  0  0  0  0  0]
 [ 0  0 45  2  1  0  0  1  0  0]
 [ 1  1  1 45  4  0  0  0  0  0]
 [ 1  0  1  0 51  0  1  0  0  1]
 [ 0  0  1  0  0 55  0  0  0  0]
 [ 0  0  0  1  0  0 25  0  0  0]
 [ 1  0  0  0  1  0  0 41  0  0]
 [ 1  0  0  3  0  1  0  2 34  1]
 [ 0  1  1  0  0  0  0  0  0 47]]


In [14]:
# Навчання та класифікація з використанням Linear Support Vector Machines

linear_svm_classifier = LinearSVC()
params_linear_svm = {'C': [0.1, 1, 10]}
grid_search_linear_svm = GridSearchCV(linear_svm_classifier, params_linear_svm, cv=skf, scoring='accuracy')
grid_search_linear_svm.fit(X_train, y_train)
best_params_linear_svm = grid_search_linear_svm.best_params_
print("\nLinear Support Vector Machines:")
print("Best parameters:", best_params_linear_svm)
evaluate_classifier(grid_search_linear_svm, X_test, y_test)


Linear Support Vector Machines:
Best parameters: {'C': 10}
Accuracy: 0.9474789915966386
Precision: 0.9490385097128281
Recall: 0.9474789915966386
F1-Score: 0.947448578417415
Cohen's Kappa: 0.9414182686390824
Confusion Matrix:
[[52  2  1  1  3  0  0  1  0  0]
 [ 0 43  0  0  1  0  0  0  0  0]
 [ 2  1 46  0  0  0  0  0  0  0]
 [ 0  1  0 47  2  0  0  2  0  0]
 [ 0  1  0  0 53  0  0  0  1  0]
 [ 0  0  0  0  0 55  0  0  1  0]
 [ 0  0  0  0  1  0 25  0  0  0]
 [ 0  0  1  0  0  0  0 42  0  0]
 [ 0  0  1  2  0  0  0  0 39  0]
 [ 0  0  0  0  0  0  0  0  0 49]]


In [11]:
# Навчання та класифікація з використанням Neural Network з Softmax Layer
mlp_classifier = MLPClassifier()
params_mlp = {'hidden_layer_sizes': [(100,), (50,), (100, 50)], 'activation': ['relu', 'tanh', 'logistic']}
grid_search_mlp = GridSearchCV(mlp_classifier, params_mlp, cv=3, scoring='accuracy')
grid_search_mlp.fit(X_train, y_train)
best_params_mlp = grid_search_mlp.best_params_
print("\nNeural Network with Softmax Layer:")
print("Best parameters:", best_params_mlp)
evaluate_classifier(grid_search_mlp, X_test, y_test)




Neural Network with Softmax Layer:
Best parameters: {'activation': 'relu', 'hidden_layer_sizes': (50,)}
Accuracy: 0.7899159663865546
Precision: 0.8038144935638695
Recall: 0.7899159663865546
F1-Score: 0.7889126085612518
Cohen's Kappa: 0.7655022242803726
Confusion Matrix:
[[43  1  2  3  8  1  0  2  0  0]
 [ 2 27  0  2  3  0  1  3  0  6]
 [ 0  0 36  0  3  2  1  2  1  4]
 [ 0  1  0 39  5  1  0  2  3  1]
 [ 0  0  0  2 51  0  0  0  2  0]
 [ 2  0  0  0  2 50  0  0  1  1]
 [ 1  0  0  1  1  0 23  0  0  0]
 [ 0  0  4  4  2  1  0 27  2  3]
 [ 1  0  1  2  1  0  0  0 37  0]
 [ 0  0  0  1  1  0  0  3  1 43]]
