In [146]:
# necessary libraries imported.
import pandas as pd
import time

# classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

# regressors (note: GaussianNB is a classifier; it is used below as the Naive Bayes model for the score target)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor

# train/test split (hold-out evaluation; no k-fold cross-validation is performed)
from sklearn.model_selection import train_test_split

# tf-idf and n-gram
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# for printing metrics
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, mean_absolute_error, mean_squared_error

# feature selection and PCA
from sklearn.feature_selection import SelectKBest, mutual_info_classif, chi2, f_regression
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Read the tweet dataset from CSV into a DataFrame.
# NOTE(review): the path looks like a placeholder — point it at the real CSV file.
file_path = './fake_news_csv_dataset_path'
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Preview the first five rows to check the columns that were loaded.
print(df.head(n=5))

   target                                              tweet  score
0    True  @POTUS Biden Blunders - 6 Month Update\n\nInfl...      5
1    True  @S0SickRick @Stairmaster_ @6d6f636869 Not as m...      3
2    True  THE SUPREME COURT is siding with super rich pr...      4
3    True  @POTUS Biden Blunders\n\nBroken campaign promi...      5
4    True  @OhComfy I agree. The confluence of events rig...      4


In [4]:
# Hold out 20% of the tweets for testing; the label is the boolean `target` column.
X_train, X_test, y_train, y_test = train_test_split(
    df['tweet'],
    df['target'],
    test_size=0.2,
    random_state=35,
)

In [5]:
# TF-IDF features capped at 3000 terms. The vocabulary is learned from the
# training tweets only and then re-used to encode the test tweets.
tfidf_vectorizer = TfidfVectorizer(max_features=3000)
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)

In [6]:
# Fit a 5-nearest-neighbour classifier on the TF-IDF training features.
# Timing uses perf_counter(): a monotonic, high-resolution clock intended for
# measuring durations, unlike the adjustable wall clock behind time.time().
knn_classifier = KNeighborsClassifier(n_neighbors=5)
start_time = time.perf_counter()
knn_classifier.fit(X_train_tfidf, y_train)
training_time = time.perf_counter() - start_time

In [7]:
# Predict the test labels with the fitted KNN model; the duration is measured
# with the monotonic perf_counter() clock rather than wall-clock time.time().
start_time = time.perf_counter()
y_pred = knn_classifier.predict(X_test_tfidf)
testing_time = time.perf_counter() - start_time

found 0 physical cores < 1
  File "C:\Users\atahi\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 217, in _count_physical_cores
    raise ValueError(


In [8]:
# Score the predictions: accuracy plus binary F1 / precision / recall.
accuracy = accuracy_score(y_test, y_pred)
f_measure, precision, recall = (
    metric(y_test, y_pred, average='binary')
    for metric in (f1_score, precision_score, recall_score)
)

In [9]:
# Report metrics, feature count and timings (single print, one item per line).
print(
    "\n--- Sonuçlar ---",
    f"Accuracy: {accuracy}",
    f"F-Measure: {f_measure}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"Number of Features: {X_train_tfidf.shape[1]}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- Sonuçlar ---
Accuracy: 0.7191877794336811
F-Measure: 0.7850808406284754
Precision: 0.6464428269546842
Recall: 0.9994191955858864
Number of Features: 3000
Training Time: 0.01952648162841797 seconds
Testing Time: 144.1406581401825 seconds


In [13]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training features.
# perf_counter() is used because it is monotonic and high-resolution, which
# matters for fits as short as this one.
nb_classifier = MultinomialNB()
start_time = time.perf_counter()
nb_classifier.fit(X_train_tfidf, y_train)
training_time = time.perf_counter() - start_time

In [14]:
# Predict the test labels with the fitted Naive Bayes model; timed with the
# monotonic perf_counter() clock rather than wall-clock time.time().
start_time = time.perf_counter()
y_pred = nb_classifier.predict(X_test_tfidf)
testing_time = time.perf_counter() - start_time

In [15]:
# Score the predictions: accuracy plus binary F1 / precision / recall.
accuracy = accuracy_score(y_test, y_pred)
f_measure, precision, recall = (
    metric(y_test, y_pred, average='binary')
    for metric in (f1_score, precision_score, recall_score)
)

In [16]:
# Report metrics, feature count and timings (single print, one item per line).
print(
    "\n--- Sonuçlar ---",
    f"Accuracy: {accuracy}",
    f"F-Measure: {f_measure}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"Number of Features: {X_train_tfidf.shape[1]}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- Sonuçlar ---
Accuracy: 0.9364008941877794
F-Measure: 0.9385285750297094
Precision: 0.9311182565201858
Recall: 0.9460577900392043
Number of Features: 3000
Training Time: 0.022007226943969727 seconds
Testing Time: 0.004040718078613281 seconds


In [6]:
# Fit a Gini-criterion decision tree (fixed seed) on the TF-IDF training features.
# The fit is timed with perf_counter(), a monotonic clock meant for durations.
dt_classifier = DecisionTreeClassifier(criterion='gini', random_state=23)
start_time = time.perf_counter()
dt_classifier.fit(X_train_tfidf, y_train)
training_time = time.perf_counter() - start_time

In [10]:
# Predict the test labels with the fitted decision tree; timed with the
# monotonic perf_counter() clock rather than wall-clock time.time().
start_time = time.perf_counter()
y_pred = dt_classifier.predict(X_test_tfidf)
testing_time = time.perf_counter() - start_time

In [11]:
# Score the predictions: accuracy plus binary F1 / precision / recall.
accuracy = accuracy_score(y_test, y_pred)
f_measure, precision, recall = (
    metric(y_test, y_pred, average='binary')
    for metric in (f1_score, precision_score, recall_score)
)

In [12]:
# Report metrics, feature count and timings (single print, one item per line).
print(
    "\n--- Sonuçlar ---",
    f"Accuracy: {accuracy}",
    f"F-Measure: {f_measure}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"Number of Features: {X_train_tfidf.shape[1]}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- Sonuçlar ---
Accuracy: 0.9698956780923994
F-Measure: 0.9707585408222351
Precision: 0.9678164237263674
Recall: 0.973718600261362
Number of Features: 3000
Training Time: 166.694317817688 seconds
Testing Time: 0.037380218505859375 seconds


In [15]:
# Keep the 200 TF-IDF features with the highest mutual information with the label.
# The selector is fitted on the training split only, then applied to both splits.
# NOTE(review): the input is a sparse matrix, for which mutual_info_classif's
# default discrete_features='auto' treats the values as discrete — confirm intended.
k_best = 200
feature_selector = SelectKBest(score_func=mutual_info_classif, k=k_best)
feature_selector.fit(X_train_tfidf, y_train)
X_train_selected = feature_selector.transform(X_train_tfidf)
X_test_selected = feature_selector.transform(X_test_tfidf)









































































































































In [16]:
# Re-create the 5-NN classifier and fit it on the selected (top-k) features.
# Timing uses perf_counter(): a monotonic, high-resolution clock intended for
# measuring durations, unlike the adjustable wall clock behind time.time().
knn_classifier = KNeighborsClassifier(n_neighbors=5)
start_time = time.perf_counter()
knn_classifier.fit(X_train_selected, y_train)
training_time = time.perf_counter() - start_time

In [17]:
# Predict on the selected test features with the fitted KNN model; timed with
# the monotonic perf_counter() clock rather than wall-clock time.time().
start_time = time.perf_counter()
y_pred = knn_classifier.predict(X_test_selected)
testing_time = time.perf_counter() - start_time

found 0 physical cores < 1
  File "C:\Users\atahi\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 217, in _count_physical_cores
    raise ValueError(


In [18]:
# Score the predictions: accuracy plus binary F1 / precision / recall.
accuracy = accuracy_score(y_test, y_pred)
f_measure, precision, recall = (
    metric(y_test, y_pred, average='binary')
    for metric in (f1_score, precision_score, recall_score)
)

In [19]:
# Report metrics, selected-feature count and timings (single print, one item per line).
print(
    "\n--- KNN + TF-IDF + Feature Selection (Mutual Information) Sonuçları ---",
    f"Accuracy: {accuracy}",
    f"F-Measure: {f_measure}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"Number of Features: {k_best}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- KNN + TF-IDF + Feature Selection (Mutual Information) Sonuçları ---
Accuracy: 0.7625931445603576
F-Measure: 0.7819451098487441
Precision: 0.7395779388917659
Recall: 0.8294613039059097
Number of Features: 200
Training Time: 0.013462543487548828 seconds
Testing Time: 56.81277871131897 seconds


In [20]:
# Re-create the multinomial Naive Bayes classifier and fit it on the selected features.
# perf_counter() is used because it is monotonic and high-resolution, which
# matters for fits as short as this one.
nb_classifier = MultinomialNB()
start_time = time.perf_counter()
nb_classifier.fit(X_train_selected, y_train)
training_time = time.perf_counter() - start_time

In [21]:
# Predict on the selected test features with the fitted Naive Bayes model; timed
# with the monotonic perf_counter() clock rather than wall-clock time.time().
start_time = time.perf_counter()
y_pred = nb_classifier.predict(X_test_selected)
testing_time = time.perf_counter() - start_time

In [22]:
# Score the predictions: accuracy plus binary F1 / precision / recall.
accuracy = accuracy_score(y_test, y_pred)
f_measure, precision, recall = (
    metric(y_test, y_pred, average='binary')
    for metric in (f1_score, precision_score, recall_score)
)

In [23]:
# Report metrics, selected-feature count and timings (single print, one item per line).
print(
    "\n--- Naive Bayes + TF-IDF + Feature Selection (Mutual Information) Sonuçları ---",
    f"Accuracy: {accuracy}",
    f"F-Measure: {f_measure}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"Number of Features: {k_best}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- Naive Bayes + TF-IDF + Feature Selection (Mutual Information) Sonuçları ---
Accuracy: 0.7663934426229508
F-Measure: 0.7797836470918796
Precision: 0.7552728262348619
Recall: 0.805938725134311
Number of Features: 200
Training Time: 0.021150827407836914 seconds
Testing Time: 0.004522562026977539 seconds


In [24]:
# Re-create the Gini decision tree (fixed seed) and fit it on the selected features.
# The fit is timed with perf_counter(), a monotonic clock meant for durations.
dt_classifier = DecisionTreeClassifier(criterion='gini', random_state=23)
start_time = time.perf_counter()
dt_classifier.fit(X_train_selected, y_train)
training_time = time.perf_counter() - start_time

In [25]:
# Predict on the selected test features with the fitted decision tree; timed
# with the monotonic perf_counter() clock rather than wall-clock time.time().
start_time = time.perf_counter()
y_pred = dt_classifier.predict(X_test_selected)
testing_time = time.perf_counter() - start_time

In [27]:
# Score the predictions: accuracy plus binary F1 / precision / recall.
accuracy = accuracy_score(y_test, y_pred)
f_measure, precision, recall = (
    metric(y_test, y_pred, average='binary')
    for metric in (f1_score, precision_score, recall_score)
)

In [28]:
# Report metrics, selected-feature count and timings (single print, one item per line).
print(
    "\n--- Decision Tree + TF-IDF + Feature Selection (Mutual Information) Sonuçları ---",
    f"Accuracy: {accuracy}",
    f"F-Measure: {f_measure}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"Number of Features: {k_best}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- Decision Tree + TF-IDF + Feature Selection (Mutual Information) Sonuçları ---
Accuracy: 0.7788002980625931
F-Measure: 0.7843365178538996
Precision: 0.7848782260996001
Recall: 0.783795556846232
Number of Features: 200
Training Time: 43.933146238327026 seconds
Testing Time: 0.014463186264038086 seconds


In [41]:
# Min-Max scale every TF-IDF feature; the scaler is fitted on the training
# split only and then applied unchanged to the test split. The sparse matrices
# are densified with .toarray() before scaling.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_tfidf.toarray())
X_test_scaled = scaler.transform(X_test_tfidf.toarray())

# PCA reduction to 500 components. random_state is pinned because PCA may
# select a randomized SVD solver for a reduction of this size, which would
# otherwise make the projected features (and every downstream score) vary
# from run to run.
n_components = 500
pca = PCA(n_components=n_components, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [32]:
# Re-create the 5-NN classifier and fit it on the PCA-reduced training features.
# Timing uses perf_counter(): a monotonic, high-resolution clock intended for
# measuring durations, unlike the adjustable wall clock behind time.time().
knn_classifier = KNeighborsClassifier(n_neighbors=5)
start_time = time.perf_counter()
knn_classifier.fit(X_train_pca, y_train)
training_time = time.perf_counter() - start_time

In [33]:
# Predict on the PCA-reduced test features with the fitted KNN model; timed
# with the monotonic perf_counter() clock rather than wall-clock time.time().
start_time = time.perf_counter()
y_pred = knn_classifier.predict(X_test_pca)
testing_time = time.perf_counter() - start_time

In [34]:
# Score the predictions: accuracy plus binary F1 / precision / recall.
accuracy = accuracy_score(y_test, y_pred)
f_measure, precision, recall = (
    metric(y_test, y_pred, average='binary')
    for metric in (f1_score, precision_score, recall_score)
)

In [35]:
# Report metrics, PCA component count and timings (single print, one item per line).
print(
    "\n--- KNN + TF-IDF + PCA Sonuçları ---",
    f"Accuracy: {accuracy}",
    f"F-Measure: {f_measure}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"Number of Features (After PCA): {n_components}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- KNN + TF-IDF + PCA Sonuçları ---
Accuracy: 0.9286512667660208
F-Measure: 0.9337393169786513
Precision: 0.8919812256230581
Recall: 0.9795992449542616
Number of Features (After PCA): 500
Training Time: 0.173109769821167 seconds
Testing Time: 8.97448444366455 seconds


In [43]:
# Re-create the Gini decision tree (fixed seed) and fit it on the PCA-reduced features.
# The fit is timed with perf_counter(), a monotonic clock meant for durations.
dt_classifier = DecisionTreeClassifier(criterion='gini', random_state=23)
start_time = time.perf_counter()
dt_classifier.fit(X_train_pca, y_train)
training_time = time.perf_counter() - start_time

In [44]:
# Predict on the PCA-reduced test features with the fitted decision tree; timed
# with the monotonic perf_counter() clock rather than wall-clock time.time().
start_time = time.perf_counter()
y_pred = dt_classifier.predict(X_test_pca)
testing_time = time.perf_counter() - start_time

In [45]:
# Score the predictions: accuracy plus binary F1 / precision / recall.
accuracy = accuracy_score(y_test, y_pred)
f_measure, precision, recall = (
    metric(y_test, y_pred, average='binary')
    for metric in (f1_score, precision_score, recall_score)
)

In [46]:
# Report metrics, PCA component count and timings (single print, one item per line).
print(
    "\n--- Decision Tree + TF-IDF + PCA Sonuçları ---",
    f"Accuracy: {accuracy}",
    f"F-Measure: {f_measure}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"Number of Features (After PCA): {n_components}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- Decision Tree + TF-IDF + PCA Sonuçları ---
Accuracy: 0.7413561847988077
F-Measure: 0.7478204010462075
Precision: 0.7483641122582522
Recall: 0.7472774793088427
Number of Features (After PCA): 500
Training Time: 307.13799834251404 seconds
Testing Time: 0.04506278038024902 seconds


In [48]:
# Count-based 1- to 3-gram features capped at 6000 terms. The vocabulary is
# learned from the training tweets only and re-used to encode the test tweets.
ngram_vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=6000)
X_train_ngram, X_test_ngram = (
    ngram_vectorizer.fit_transform(X_train),
    ngram_vectorizer.transform(X_test),
)

In [49]:
# Fit a 5-nearest-neighbour classifier on the n-gram count features.
# Timing uses perf_counter(): a monotonic, high-resolution clock intended for
# measuring durations, unlike the adjustable wall clock behind time.time().
knn_classifier = KNeighborsClassifier(n_neighbors=5)
start_time = time.perf_counter()
knn_classifier.fit(X_train_ngram, y_train)
training_time = time.perf_counter() - start_time

In [50]:
# Predict on the n-gram test features with the fitted KNN model; timed with
# the monotonic perf_counter() clock rather than wall-clock time.time().
start_time = time.perf_counter()
y_pred = knn_classifier.predict(X_test_ngram)
testing_time = time.perf_counter() - start_time

In [51]:
# Score the predictions: accuracy plus binary F1 / precision / recall.
accuracy = accuracy_score(y_test, y_pred)
f_measure, precision, recall = (
    metric(y_test, y_pred, average='binary')
    for metric in (f1_score, precision_score, recall_score)
)

In [52]:
# Report metrics, n-gram feature count and timings (single print, one item per line).
print(
    "\n--- KNN + n-gram Sonuçları ---",
    f"Accuracy: {accuracy}",
    f"F-Measure: {f_measure}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"Number of Features (After n-gram): {X_train_ngram.shape[1]}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- KNN + n-gram Sonuçları ---
Accuracy: 0.8944113263785395
F-Measure: 0.9017813821307271
Precision: 0.8627320954907162
Recall: 0.9445331784521562
Number of Features (After n-gram): 6000
Training Time: 0.019441604614257812 seconds
Testing Time: 123.06727433204651 seconds


In [53]:
# Fit a multinomial Naive Bayes classifier on the n-gram count features.
# perf_counter() is used because it is monotonic and high-resolution, which
# matters for fits as short as this one.
nb_classifier = MultinomialNB()
start_time = time.perf_counter()
nb_classifier.fit(X_train_ngram, y_train)
training_time = time.perf_counter() - start_time

In [54]:
# Predict on the n-gram test features with the fitted Naive Bayes model; timed
# with the monotonic perf_counter() clock rather than wall-clock time.time().
start_time = time.perf_counter()
y_pred = nb_classifier.predict(X_test_ngram)
testing_time = time.perf_counter() - start_time

In [55]:
# Score the predictions: accuracy plus binary F1 / precision / recall.
accuracy = accuracy_score(y_test, y_pred)
f_measure, precision, recall = (
    metric(y_test, y_pred, average='binary')
    for metric in (f1_score, precision_score, recall_score)
)

In [56]:
# Report metrics, n-gram feature count and timings (single print, one item per line).
print(
    "\n--- Naive Bayes + n-gram Sonuçları ---",
    f"Accuracy: {accuracy}",
    f"F-Measure: {f_measure}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"Number of Features (After n-gram): {X_train_ngram.shape[1]}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- Naive Bayes + n-gram Sonuçları ---
Accuracy: 0.9434426229508197
F-Measure: 0.9447799199708985
Precision: 0.9467774861475649
Recall: 0.9427907652098156
Number of Features (After n-gram): 6000
Training Time: 0.025516748428344727 seconds
Testing Time: 0.007265806198120117 seconds


In [57]:
# Fit a Gini-criterion decision tree (fixed seed) on the n-gram count features.
# The fit is timed with perf_counter(), a monotonic clock meant for durations.
dt_classifier = DecisionTreeClassifier(criterion='gini', random_state=23)
start_time = time.perf_counter()
dt_classifier.fit(X_train_ngram, y_train)
training_time = time.perf_counter() - start_time

In [58]:
# Predict on the n-gram test features with the fitted decision tree; timed
# with the monotonic perf_counter() clock rather than wall-clock time.time().
start_time = time.perf_counter()
y_pred = dt_classifier.predict(X_test_ngram)
testing_time = time.perf_counter() - start_time

In [59]:
# Score the predictions: accuracy plus binary F1 / precision / recall.
accuracy = accuracy_score(y_test, y_pred)
f_measure, precision, recall = (
    metric(y_test, y_pred, average='binary')
    for metric in (f1_score, precision_score, recall_score)
)

In [60]:
# Report metrics, n-gram feature count and timings (single print, one item per line).
print(
    "\n--- Decision Tree + n-gram Sonuçları ---",
    f"Accuracy: {accuracy}",
    f"F-Measure: {f_measure}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"Number of Features (After n-gram): {X_train_ngram.shape[1]}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- Decision Tree + n-gram Sonuçları ---
Accuracy: 0.9751862891207154
F-Measure: 0.9758082092262985
Precision: 0.9764466414655423
Recall: 0.9751706112966458
Number of Features (After n-gram): 6000
Training Time: 48.77558970451355 seconds
Testing Time: 0.019765615463256836 seconds


In [62]:
# Keep the 1000 n-gram features with the highest chi-squared score against the label.
# The selector is fitted on the training split only, then applied to both splits.
k_best = 1000
feature_selector = SelectKBest(score_func=chi2, k=k_best)
feature_selector.fit(X_train_ngram, y_train)
X_train_selected = feature_selector.transform(X_train_ngram)
X_test_selected = feature_selector.transform(X_test_ngram)

In [63]:
# Re-create the 5-NN classifier and fit it on the chi2-selected n-gram features.
# Timing uses perf_counter(): a monotonic, high-resolution clock intended for
# measuring durations, unlike the adjustable wall clock behind time.time().
knn_classifier = KNeighborsClassifier(n_neighbors=5)
start_time = time.perf_counter()
knn_classifier.fit(X_train_selected, y_train)
training_time = time.perf_counter() - start_time

In [64]:
# Predict on the selected test features with the fitted KNN model; timed with
# the monotonic perf_counter() clock rather than wall-clock time.time().
start_time = time.perf_counter()
y_pred = knn_classifier.predict(X_test_selected)
testing_time = time.perf_counter() - start_time

In [65]:
# Score the predictions: accuracy plus binary F1 / precision / recall.
accuracy = accuracy_score(y_test, y_pred)
f_measure, precision, recall = (
    metric(y_test, y_pred, average='binary')
    for metric in (f1_score, precision_score, recall_score)
)

In [66]:
# Report metrics, selected-feature count and timings (single print, one item per line).
print(
    "\n--- KNN + n-gram + Feature Selection Sonuçları ---",
    f"Accuracy: {accuracy}",
    f"F-Measure: {f_measure}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"Number of Features (After Feature Selection): {k_best}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- KNN + n-gram + Feature Selection Sonuçları ---
Accuracy: 0.8961997019374068
F-Measure: 0.9008893632159374
Precision: 0.8832310267857143
Recall: 0.919268186438217
Number of Features (After Feature Selection): 1000
Training Time: 0.010404109954833984 seconds
Testing Time: 92.23212599754333 seconds


In [67]:
# Re-create the multinomial Naive Bayes classifier and fit it on the
# chi2-selected n-gram features. perf_counter() is used because it is
# monotonic and high-resolution, which matters for fits as short as this one.
nb_classifier = MultinomialNB()
start_time = time.perf_counter()
nb_classifier.fit(X_train_selected, y_train)
training_time = time.perf_counter() - start_time

In [68]:
# Predict on the selected test features with the fitted Naive Bayes model; timed
# with the monotonic perf_counter() clock rather than wall-clock time.time().
start_time = time.perf_counter()
y_pred = nb_classifier.predict(X_test_selected)
testing_time = time.perf_counter() - start_time

In [69]:
# Score the predictions: accuracy plus binary F1 / precision / recall.
accuracy = accuracy_score(y_test, y_pred)
f_measure, precision, recall = (
    metric(y_test, y_pred, average='binary')
    for metric in (f1_score, precision_score, recall_score)
)

In [70]:
# Report metrics, selected-feature count and timings (single print, one item per line).
print(
    "\n--- Naive Bayes + n-gram + Feature Selection Sonuçları ---",
    f"Accuracy: {accuracy}",
    f"F-Measure: {f_measure}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"Number of Features (After Feature Selection): {k_best}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- Naive Bayes + n-gram + Feature Selection Sonuçları ---
Accuracy: 0.9199329359165425
F-Measure: 0.9215950965011492
Precision: 0.9262926292629263
Recall: 0.9169449687817628
Number of Features (After Feature Selection): 1000
Training Time: 0.019061565399169922 seconds
Testing Time: 0.002003908157348633 seconds


In [71]:
# Re-create the Gini decision tree (fixed seed) and fit it on the chi2-selected
# n-gram features. The fit is timed with perf_counter(), a monotonic clock
# meant for durations.
dt_classifier = DecisionTreeClassifier(criterion='gini', random_state=23)
start_time = time.perf_counter()
dt_classifier.fit(X_train_selected, y_train)
training_time = time.perf_counter() - start_time

In [72]:
# Predict on the selected test features with the fitted decision tree; timed
# with the monotonic perf_counter() clock rather than wall-clock time.time().
start_time = time.perf_counter()
y_pred = dt_classifier.predict(X_test_selected)
testing_time = time.perf_counter() - start_time

In [73]:
# Score the predictions: accuracy plus binary F1 / precision / recall.
accuracy = accuracy_score(y_test, y_pred)
f_measure, precision, recall = (
    metric(y_test, y_pred, average='binary')
    for metric in (f1_score, precision_score, recall_score)
)

In [74]:
# Report metrics, selected-feature count and timings (single print, one item per line).
print(
    "\n--- Decision Tree + n-gram + Feature Selection Sonuçları ---",
    f"Accuracy: {accuracy}",
    f"F-Measure: {f_measure}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"Number of Features (After Feature Selection): {k_best}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- Decision Tree + n-gram + Feature Selection Sonuçları ---
Accuracy: 0.9226900149031296
F-Measure: 0.9236655262480226
Precision: 0.9362368558430905
Recall: 0.911427326847684
Number of Features (After Feature Selection): 1000
Training Time: 14.19485878944397 seconds
Testing Time: 0.011683940887451172 seconds


In [75]:
# Dimensionality reduction with PCA: project the densified n-gram counts onto
# 300 components. random_state is pinned because PCA may select a randomized
# SVD solver for a reduction of this size, which would otherwise make the
# projected features (and every downstream score) vary from run to run.
n_components = 300
pca = PCA(n_components=n_components, random_state=42)
X_train_pca = pca.fit_transform(X_train_ngram.toarray())
X_test_pca = pca.transform(X_test_ngram.toarray())

In [76]:
# Re-create the 5-NN classifier and fit it on the PCA-reduced n-gram features.
# Timing uses perf_counter(): a monotonic, high-resolution clock intended for
# measuring durations, unlike the adjustable wall clock behind time.time().
knn_classifier = KNeighborsClassifier(n_neighbors=5)
start_time = time.perf_counter()
knn_classifier.fit(X_train_pca, y_train)
training_time = time.perf_counter() - start_time

In [77]:
# Predict on the PCA-reduced test features with the fitted KNN model; timed
# with the monotonic perf_counter() clock rather than wall-clock time.time().
start_time = time.perf_counter()
y_pred = knn_classifier.predict(X_test_pca)
testing_time = time.perf_counter() - start_time

In [78]:
# Score the predictions: accuracy plus binary F1 / precision / recall.
accuracy = accuracy_score(y_test, y_pred)
f_measure, precision, recall = (
    metric(y_test, y_pred, average='binary')
    for metric in (f1_score, precision_score, recall_score)
)

In [79]:
# Report metrics, PCA component count and timings (single print, one item per line).
print(
    "\n--- KNN + n-gram + PCA Sonuçları ---",
    f"Accuracy: {accuracy}",
    f"F-Measure: {f_measure}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"Number of Features (After PCA): {n_components}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- KNN + n-gram + PCA Sonuçları ---
Accuracy: 0.8509687034277198
F-Measure: 0.859392575928009
Precision: 0.8330380264413247
Recall: 0.8874691447655002
Number of Features (After PCA): 300
Training Time: 0.12923574447631836 seconds
Testing Time: 5.732544422149658 seconds


In [81]:
# Re-create the Gini decision tree (fixed seed) and fit it on the PCA-reduced
# n-gram features. The fit is timed with perf_counter(), a monotonic clock
# meant for durations.
dt_classifier = DecisionTreeClassifier(criterion='gini', random_state=23)
start_time = time.perf_counter()
dt_classifier.fit(X_train_pca, y_train)
training_time = time.perf_counter() - start_time

In [83]:
# Predict on the PCA-reduced test features with the fitted decision tree; timed
# with the monotonic perf_counter() clock rather than wall-clock time.time().
start_time = time.perf_counter()
y_pred = dt_classifier.predict(X_test_pca)
testing_time = time.perf_counter() - start_time

In [84]:
# Score the predictions: accuracy plus binary F1 / precision / recall.
accuracy = accuracy_score(y_test, y_pred)
f_measure, precision, recall = (
    metric(y_test, y_pred, average='binary')
    for metric in (f1_score, precision_score, recall_score)
)

In [85]:
# Report metrics, PCA component count and timings (single print, one item per line).
print(
    "\n--- Decision Tree + n-gram + PCA Sonuçları ---",
    f"Accuracy: {accuracy}",
    f"F-Measure: {f_measure}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"Number of Features (After PCA): {n_components}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- Decision Tree + n-gram + PCA Sonuçları ---
Accuracy: 0.7257824143070045
F-Measure: 0.7317197637967486
Precision: 0.7347730600292826
Recall: 0.7286917380572092
Number of Features (After PCA): 300
Training Time: 154.38585472106934 seconds
Testing Time: 0.03141641616821289 seconds


In [87]:
# New 80/20 split for the regression experiments: the target is now the numeric
# `score` column. This overwrites the classification split variables.
X_train, X_test, y_train, y_test = train_test_split(
    df['tweet'],
    df['score'],
    test_size=0.2,
    random_state=87,
)

# Rebuild the TF-IDF features (capped at 7000 terms) on the new training split.
tfidf_vectorizer = TfidfVectorizer(max_features=7000)
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)

In [88]:
# Fit a 5-nearest-neighbour regressor on the TF-IDF training features.
# Timing uses perf_counter(): a monotonic, high-resolution clock intended for
# measuring durations, unlike the adjustable wall clock behind time.time().
knn_regressor = KNeighborsRegressor(n_neighbors=5)
start_time = time.perf_counter()
knn_regressor.fit(X_train_tfidf, y_train)
training_time = time.perf_counter() - start_time

In [89]:
# Predict test scores with the fitted KNN regressor; timed with the monotonic
# perf_counter() clock rather than wall-clock time.time().
start_time = time.perf_counter()
y_pred = knn_regressor.predict(X_test_tfidf)
testing_time = time.perf_counter() - start_time

In [90]:
# Regression error metrics. RMSE is computed as the square root of the MSE
# instead of passing `squared=False`, which is deprecated and no longer
# accepted by newer scikit-learn releases.
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

In [91]:
# Report error metrics, feature count and timings (single print, one item per line).
print(
    "\n--- KNN Regresyon + TF-IDF Sonuçları ---",
    f"MAE: {mae}",
    f"RMSE: {rmse}",
    f"Number of Features (After TF-IDF): {X_train_tfidf.shape[1]}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- KNN Regresyon + TF-IDF Sonuçları ---
MAE: 0.7166840536512669
RMSE: 0.876181765298746
Number of Features (After TF-IDF): 7000
Training Time: 0.016933917999267578 seconds
Testing Time: 170.80575919151306 seconds


In [93]:
# Fit Gaussian Naive Bayes on the densified TF-IDF features.
# NOTE(review): GaussianNB is a classifier, so each distinct score value is
# treated as a class label rather than a continuous target — confirm intended.
# Timing uses the monotonic perf_counter() clock; the .toarray() conversion is
# inside the timed region, as before.
nb_regressor = GaussianNB()
start_time = time.perf_counter()
nb_regressor.fit(X_train_tfidf.toarray(), y_train)
training_time = time.perf_counter() - start_time

In [94]:
# Predict test scores with the fitted Gaussian Naive Bayes model on the
# densified test matrix; timed with the monotonic perf_counter() clock.
start_time = time.perf_counter()
y_pred = nb_regressor.predict(X_test_tfidf.toarray())
testing_time = time.perf_counter() - start_time

In [95]:
# Regression error metrics. RMSE is computed as the square root of the MSE
# instead of passing `squared=False`, which is deprecated and no longer
# accepted by newer scikit-learn releases.
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

In [96]:
# Report error metrics, feature count and timings (single print, one item per line).
print(
    "\n--- Naive Bayes Regresyon + TF-IDF Sonuçları ---",
    f"MAE: {mae}",
    f"RMSE: {rmse}",
    f"Number of Features (After TF-IDF): {X_train_tfidf.shape[1]}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- Naive Bayes Regresyon + TF-IDF Sonuçları ---
MAE: 1.7407600596125186
RMSE: 2.043874203215379
Number of Features (After TF-IDF): 7000
Training Time: 46.50922632217407 seconds
Testing Time: 8.007112264633179 seconds


In [100]:
# Fit a decision-tree regressor (friedman_mse criterion, fixed seed) on the
# TF-IDF training features. The fit is timed with perf_counter(), a monotonic
# clock meant for durations.
dt_regressor = DecisionTreeRegressor(criterion='friedman_mse', random_state=42)
start_time = time.perf_counter()
dt_regressor.fit(X_train_tfidf, y_train)
training_time = time.perf_counter() - start_time

In [101]:
# Predict test scores with the fitted decision-tree regressor.
# The tree was fitted on the sparse TF-IDF matrix, so the test matrix is passed
# in the same sparse form: densifying it with .toarray() inside the timed
# region only inflated memory use and the reported testing time.
# Timing uses the monotonic perf_counter() clock.
start_time = time.perf_counter()
y_pred = dt_regressor.predict(X_test_tfidf)
testing_time = time.perf_counter() - start_time

In [102]:
# Regression error metrics. RMSE is computed as the square root of the MSE
# instead of passing `squared=False`, which is deprecated and no longer
# accepted by newer scikit-learn releases.
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

In [103]:
# Report error metrics, feature count and timings (single print, one item per line).
print(
    "\n--- Decision Tree Regresyon + TF-IDF Sonuçları ---",
    f"MAE: {mae}",
    f"RMSE: {rmse}",
    f"Number of Features (After TF-IDF): {X_train_tfidf.shape[1]}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- Decision Tree Regresyon + TF-IDF Sonuçları ---
MAE: 0.8362988091700657
RMSE: 1.1304758286613363
Number of Features (After TF-IDF): 7000
Training Time: 610.4818692207336 seconds
Testing Time: 4.340506553649902 seconds


In [104]:
# Keep the 1000 TF-IDF features with the highest chi-squared score against the target.
# The selector is fitted on the training split only, then applied to both splits.
# NOTE(review): chi2 is a classification scorer and y_train is the numeric
# `score` column here, so each distinct score acts as a class; f_regression is
# imported but unused — confirm which scorer was intended.
k_best = 1000
feature_selector = SelectKBest(score_func=chi2, k=k_best)
feature_selector.fit(X_train_tfidf, y_train)
X_train_selected = feature_selector.transform(X_train_tfidf)
X_test_selected = feature_selector.transform(X_test_tfidf)

In [107]:
# Re-create the 5-NN regressor and fit it on the chi2-selected features.
# Timing uses perf_counter(): a monotonic, high-resolution clock intended for
# measuring durations, unlike the adjustable wall clock behind time.time().
knn_regressor = KNeighborsRegressor(n_neighbors=5)
start_time = time.perf_counter()
knn_regressor.fit(X_train_selected, y_train)
training_time = time.perf_counter() - start_time

In [108]:
# Predict test scores on the selected features with the fitted KNN regressor;
# timed with the monotonic perf_counter() clock rather than time.time().
start_time = time.perf_counter()
y_pred = knn_regressor.predict(X_test_selected)
testing_time = time.perf_counter() - start_time

In [109]:
# Regression error metrics. RMSE is computed as the square root of the MSE
# instead of passing `squared=False`, which is deprecated and no longer
# accepted by newer scikit-learn releases.
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

In [110]:
# Report error metrics, selected-feature count and timings (single print, one item per line).
print(
    "\n--- KNN Regresyon + TF-IDF + Feature Selection Sonuçları ---",
    f"MAE: {mae}",
    f"RMSE: {rmse}",
    f"Number of Features (After Feature Selection): {k_best}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- KNN Regresyon + TF-IDF + Feature Selection Sonuçları ---
MAE: 0.7089269746646797
RMSE: 0.8856952273595649
Number of Features (After Feature Selection): 1000
Training Time: 0.006996870040893555 seconds
Testing Time: 5.634212493896484 seconds


In [111]:
# Fit Gaussian Naive Bayes on the densified chi2-selected features.
# NOTE(review): GaussianNB is a classifier, so each distinct score value is
# treated as a class label rather than a continuous target — confirm intended.
# Timing uses the monotonic perf_counter() clock; the .toarray() conversion is
# inside the timed region, as before.
nb_regressor = GaussianNB()
start_time = time.perf_counter()
nb_regressor.fit(X_train_selected.toarray(), y_train)
training_time = time.perf_counter() - start_time

In [112]:
# Predict test scores with the fitted Gaussian Naive Bayes model on the
# densified selected features; timed with the monotonic perf_counter() clock.
start_time = time.perf_counter()
y_pred = nb_regressor.predict(X_test_selected.toarray())
testing_time = time.perf_counter() - start_time

In [113]:
# Regression error metrics. RMSE is computed as the square root of the MSE
# instead of passing `squared=False`, which is deprecated and no longer
# accepted by newer scikit-learn releases.
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

In [114]:
# Report error metrics, selected-feature count and timings (single print, one item per line).
print(
    "\n--- Naive Bayes Regresyon + TF-IDF + Feature Selection Sonuçları ---",
    f"MAE: {mae}",
    f"RMSE: {rmse}",
    f"Number of Features (After Feature Selection): {k_best}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- Naive Bayes Regresyon + TF-IDF + Feature Selection Sonuçları ---
MAE: 2.7866244411326377
RMSE: 2.966937785185669
Number of Features (After Feature Selection): 1000
Training Time: 3.221574544906616 seconds
Testing Time: 1.4512667655944824 seconds


In [115]:
# Re-create the decision-tree regressor (friedman_mse criterion, fixed seed)
# and fit it on the chi2-selected features. The fit is timed with
# perf_counter(), a monotonic clock meant for durations.
dt_regressor = DecisionTreeRegressor(criterion='friedman_mse', random_state=42)
start_time = time.perf_counter()
dt_regressor.fit(X_train_selected, y_train)
training_time = time.perf_counter() - start_time

In [116]:
# Predict test scores on the selected features with the fitted decision tree;
# timed with the monotonic perf_counter() clock rather than time.time().
start_time = time.perf_counter()
y_pred = dt_regressor.predict(X_test_selected)
testing_time = time.perf_counter() - start_time

In [117]:
# Regression error metrics. RMSE is computed as the square root of the MSE
# instead of passing `squared=False`, which is deprecated and no longer
# accepted by newer scikit-learn releases.
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

In [118]:
# Report error metrics, selected-feature count and timings (single print, one item per line).
print(
    "\n--- Decision Tree Regresyon + TF-IDF + Feature Selection Sonuçları ---",
    f"MAE: {mae}",
    f"RMSE: {rmse}",
    f"Number of Features (After Feature Selection): {k_best}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- Decision Tree Regresyon + TF-IDF + Feature Selection Sonuçları ---
MAE: 0.8179131872611023
RMSE: 1.1014849755315474
Number of Features (After Feature Selection): 1000
Training Time: 124.93194460868835 seconds
Testing Time: 0.1012430191040039 seconds


In [119]:
# PCA: project the densified TF-IDF features onto 1000 components.
# random_state is pinned because PCA may select a randomized SVD solver for a
# reduction of this size, which would otherwise make the projected features
# (and every downstream score) vary from run to run.
n_components = 1000
pca = PCA(n_components=n_components, random_state=42)
X_train_pca = pca.fit_transform(X_train_tfidf.toarray())
X_test_pca = pca.transform(X_test_tfidf.toarray())

In [120]:
# Re-create the 5-NN regressor and fit it on the PCA-reduced features.
# Timing uses perf_counter(): a monotonic, high-resolution clock intended for
# measuring durations, unlike the adjustable wall clock behind time.time().
knn_regressor = KNeighborsRegressor(n_neighbors=5)
start_time = time.perf_counter()
knn_regressor.fit(X_train_pca, y_train)
training_time = time.perf_counter() - start_time

In [121]:
# Predict test scores on the PCA-reduced features with the fitted KNN regressor;
# timed with the monotonic perf_counter() clock rather than time.time().
start_time = time.perf_counter()
y_pred = knn_regressor.predict(X_test_pca)
testing_time = time.perf_counter() - start_time

In [122]:
# Regression error metrics. RMSE is computed as the square root of the MSE
# instead of passing `squared=False`, which is deprecated and no longer
# accepted by newer scikit-learn releases.
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

In [123]:
# Report error metrics, PCA component count and timings (single print, one item per line).
print(
    "\n--- KNN Regresyon + TF-IDF + PCA Sonuçları ---",
    f"MAE: {mae}",
    f"RMSE: {rmse}",
    f"Number of Features (After PCA): {n_components}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- KNN Regresyon + TF-IDF + PCA Sonuçları ---
MAE: 0.7169001490312965
RMSE: 0.8883850617180342
Number of Features (After PCA): 1000
Training Time: 0.4556596279144287 seconds
Testing Time: 26.42832851409912 seconds


In [124]:
# Fit Gaussian Naive Bayes on the PCA-reduced features.
# NOTE(review): GaussianNB is a classifier, so each distinct score value is
# treated as a class label rather than a continuous target — confirm intended.
# Timing uses the monotonic perf_counter() clock rather than time.time().
nb_regressor = GaussianNB()
start_time = time.perf_counter()
nb_regressor.fit(X_train_pca, y_train)
training_time = time.perf_counter() - start_time

In [125]:
# Predict with the Naive Bayes model on the PCA-reduced test features.
# The call is timed with the monotonic time.perf_counter() clock.
start_time = time.perf_counter()
y_pred = nb_regressor.predict(X_test_pca)
testing_time = time.perf_counter() - start_time

In [126]:
# Score the Naive Bayes predictions with MAE and RMSE.
# The square root of the MSE replaces `squared=False`, which scikit-learn
# deprecated in 1.4 and removed in 1.6.
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

In [127]:
# Report metrics, component count and timings for Naive Bayes on the
# PCA-reduced TF-IDF features, written as one newline-separated print call.
# The Turkish heading text is kept verbatim.
print(
    "\n--- Naive Bayes Regresyon + TF-IDF + PCA Sonuçları ---",
    f"MAE: {mae}",
    f"RMSE: {rmse}",
    f"Number of Features (After PCA): {n_components}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- Naive Bayes Regresyon + TF-IDF + PCA Sonuçları ---
MAE: 0.9137481371087929
RMSE: 1.2724145347083664
Number of Features (After PCA): 1000
Training Time: 1.7189741134643555 seconds
Testing Time: 1.4299910068511963 seconds


In [128]:
# Fit a decision tree regressor (Friedman MSE split criterion, seeded with
# random_state=42) on the PCA-reduced training features.
# The fit is timed with time.perf_counter(), which is monotonic and therefore
# safe for long runs during which the system clock might be adjusted.
dt_regressor = DecisionTreeRegressor(criterion='friedman_mse', random_state=42)
start_time = time.perf_counter()
dt_regressor.fit(X_train_pca, y_train)
training_time = time.perf_counter() - start_time

In [129]:
# Predict with the decision tree on the PCA-reduced test features.
# perf_counter() gives a monotonic, high-resolution reading for this short call.
start_time = time.perf_counter()
y_pred = dt_regressor.predict(X_test_pca)
testing_time = time.perf_counter() - start_time

In [130]:
# Score the decision tree predictions with MAE and RMSE.
# RMSE is derived as sqrt(MSE) because the `squared=False` argument was
# deprecated in scikit-learn 1.4 and removed in 1.6.
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

In [131]:
# Report metrics, component count and timings for the decision tree on the
# PCA-reduced TF-IDF features, as one newline-separated print call.
# The Turkish heading text is kept verbatim.
print(
    "\n--- Decision Tree Regresyon + TF-IDF + PCA Sonuçları ---",
    f"MAE: {mae}",
    f"RMSE: {rmse}",
    f"Number of Features (After PCA): {n_components}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- Decision Tree Regresyon + TF-IDF + PCA Sonuçları ---
MAE: 0.8611486533968119
RMSE: 1.1622622951733506
Number of Features (After PCA): 1000
Training Time: 1228.1140096187592 seconds
Testing Time: 0.15682482719421387 seconds


In [132]:
# Build count features over 1- to 4-grams, keeping at most 5000 features.
# The vocabulary is learned from the training texts only; the test texts are
# encoded with that same vocabulary.
ngram_vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 4))
X_train_ngram = ngram_vectorizer.fit_transform(X_train)
X_test_ngram = ngram_vectorizer.transform(X_test)

In [133]:
# Fit a 5-nearest-neighbour regressor on the n-gram count features.
# The fit is timed with time.perf_counter(): this step takes only
# milliseconds, where the resolution of wall-clock time.time() can be too
# coarse on some platforms.
knn_regressor = KNeighborsRegressor(n_neighbors=5)
start_time = time.perf_counter()
knn_regressor.fit(X_train_ngram, y_train)
training_time = time.perf_counter() - start_time

In [134]:
# Predict with KNN on the n-gram test features.
# The call is timed with the monotonic time.perf_counter() clock.
start_time = time.perf_counter()
y_pred = knn_regressor.predict(X_test_ngram)
testing_time = time.perf_counter() - start_time

In [135]:
# Calculate performance metrics (MAE and RMSE) for the KNN n-gram run.
# RMSE is taken as sqrt(MSE); `squared=False` was deprecated in scikit-learn
# 1.4 and removed in 1.6.
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

In [136]:
# Report metrics, the n-gram feature count (read from the matrix shape) and
# timings for KNN, as one newline-separated print call.
# The Turkish heading text is kept verbatim.
print(
    "\n--- KNN Regresyon + n-gram Sonuçları ---",
    f"MAE: {mae}",
    f"RMSE: {rmse}",
    f"Number of Features (After n-gram): {X_train_ngram.shape[1]}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- KNN Regresyon + n-gram Sonuçları ---
MAE: 0.7189865871833085
RMSE: 0.8940822168191194
Number of Features (After n-gram): 5000
Training Time: 0.012627840042114258 seconds
Testing Time: 126.94551372528076 seconds


In [137]:
# Fit Gaussian Naive Bayes on the n-gram count features.
# NOTE: scikit-learn's GaussianNB is a classifier, so the target values are
# treated as discrete class labels despite the "regressor" name.
# The sparse matrix is densified with .toarray() because GaussianNB expects a
# dense array; that conversion happens inside the timed span.
# Timing uses the monotonic, high-resolution time.perf_counter() clock.
nb_regressor = GaussianNB()
start_time = time.perf_counter()
nb_regressor.fit(X_train_ngram.toarray(), y_train)
training_time = time.perf_counter() - start_time

In [138]:
# Predict with Naive Bayes on the densified n-gram test features.
# The .toarray() conversion is part of the timed span. The call is timed with
# the monotonic time.perf_counter() clock.
start_time = time.perf_counter()
y_pred = nb_regressor.predict(X_test_ngram.toarray())
testing_time = time.perf_counter() - start_time

In [139]:
# Score the Naive Bayes n-gram predictions with MAE and RMSE.
# sqrt(MSE) replaces `squared=False`, which scikit-learn deprecated in 1.4 and
# removed in 1.6.
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

In [140]:
# Report metrics, the n-gram feature count (read from the matrix shape) and
# timings for Naive Bayes, as one newline-separated print call.
# The Turkish heading text is kept verbatim.
print(
    "\n--- Naive Bayes Regresyon + n-gram Sonuçları ---",
    f"MAE: {mae}",
    f"RMSE: {rmse}",
    f"Number of Features (After n-gram): {X_train_ngram.shape[1]}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- Naive Bayes Regresyon + n-gram Sonuçları ---
MAE: 1.632451564828614
RMSE: 1.98583846181487
Number of Features (After n-gram): 5000
Training Time: 13.650327920913696 seconds
Testing Time: 5.382228851318359 seconds


In [141]:
# Fit a decision tree regressor (Friedman MSE split criterion, seeded with
# random_state=42) on the n-gram count features.
# The fit is timed with the monotonic time.perf_counter() clock.
dt_regressor = DecisionTreeRegressor(criterion='friedman_mse', random_state=42)
start_time = time.perf_counter()
dt_regressor.fit(X_train_ngram, y_train)
training_time = time.perf_counter() - start_time

In [142]:
# Predict with the decision tree on the n-gram test features.
# This call takes only a fraction of a second, so the high-resolution
# time.perf_counter() clock is used for the measurement.
start_time = time.perf_counter()
y_pred = dt_regressor.predict(X_test_ngram)
testing_time = time.perf_counter() - start_time

In [143]:
# Score the decision tree n-gram predictions with MAE and RMSE.
# RMSE is computed as sqrt(MSE) since `squared=False` was deprecated in
# scikit-learn 1.4 and removed in 1.6.
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

In [144]:
# Report metrics, the n-gram feature count (read from the matrix shape) and
# timings for the decision tree, as one newline-separated print call.
# The Turkish heading text is kept verbatim.
print(
    "\n--- Decision Tree Regresyon + n-gram Sonuçları ---",
    f"MAE: {mae}",
    f"RMSE: {rmse}",
    f"Number of Features (After n-gram): {X_train_ngram.shape[1]}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- Decision Tree Regresyon + n-gram Sonuçları ---
MAE: 0.8278198422572717
RMSE: 1.1157856582889147
Number of Features (After n-gram): 5000
Training Time: 243.63501858711243 seconds
Testing Time: 0.02976393699645996 seconds


In [147]:
# Univariate feature selection on the n-gram counts: keep the k_best columns
# with the highest f_regression score against the training target.
# The selector is fitted on the training split only; the same column mask is
# then applied to both splits.
k_best = 1000
feature_selector = SelectKBest(score_func=f_regression, k=k_best)
feature_selector.fit(X_train_ngram, y_train)
X_train_selected = feature_selector.transform(X_train_ngram)
X_test_selected = feature_selector.transform(X_test_ngram)

In [148]:
# Fit a 5-nearest-neighbour regressor on the feature-selected n-gram matrix.
# The fit is timed with time.perf_counter(): this step finishes in
# milliseconds, so a monotonic high-resolution clock gives a more
# trustworthy reading than wall-clock time.time().
knn_regressor = KNeighborsRegressor(n_neighbors=5)
start_time = time.perf_counter()
knn_regressor.fit(X_train_selected, y_train)
training_time = time.perf_counter() - start_time

In [149]:
# Predict with KNN on the feature-selected n-gram test matrix.
# The call is timed with the monotonic time.perf_counter() clock.
start_time = time.perf_counter()
y_pred = knn_regressor.predict(X_test_selected)
testing_time = time.perf_counter() - start_time

In [150]:
# Score the KNN predictions (feature-selected n-grams) with MAE and RMSE.
# sqrt(MSE) is used because `squared=False` was deprecated in scikit-learn 1.4
# and removed in 1.6.
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

In [151]:
# Report metrics, the configured number of selected features and timings for
# KNN on the feature-selected n-grams, as one newline-separated print call.
# The Turkish heading text is kept verbatim.
print(
    "\n--- KNN Regresyon + n-gram + Feature Selection Sonuçları ---",
    f"MAE: {mae}",
    f"RMSE: {rmse}",
    f"Number of Features (After Feature Selection): {k_best}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- KNN Regresyon + n-gram + Feature Selection Sonuçları ---
MAE: 0.7200074515648286
RMSE: 0.8919101556672703
Number of Features (After Feature Selection): 1000
Training Time: 0.003463268280029297 seconds
Testing Time: 63.428171157836914 seconds


In [153]:
# Fit Gaussian Naive Bayes on the feature-selected n-gram matrix.
# NOTE: scikit-learn's GaussianNB is a classifier, so the target values are
# treated as discrete class labels despite the "regressor" name.
# The matrix is densified with .toarray() because GaussianNB expects a dense
# array; that conversion happens inside the timed span.
# Timing uses the monotonic, high-resolution time.perf_counter() clock.
nb_regressor = GaussianNB()
start_time = time.perf_counter()
nb_regressor.fit(X_train_selected.toarray(), y_train)
training_time = time.perf_counter() - start_time

In [155]:
# Predict with Naive Bayes on the densified, feature-selected test matrix.
# The .toarray() conversion is part of the timed span. The call is timed with
# the monotonic time.perf_counter() clock.
start_time = time.perf_counter()
y_pred = nb_regressor.predict(X_test_selected.toarray())
testing_time = time.perf_counter() - start_time

In [156]:
# Score the Naive Bayes predictions (feature-selected n-grams) with MAE and
# RMSE. RMSE is sqrt(MSE); `squared=False` was deprecated in scikit-learn 1.4
# and removed in 1.6.
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

In [157]:
# Report metrics, the configured number of selected features and timings for
# Naive Bayes on the feature-selected n-grams, as one newline-separated print
# call. The Turkish heading text is kept verbatim.
print(
    "\n--- Naive Bayes Regresyon + n-gram + Feature Selection Sonuçları ---",
    f"MAE: {mae}",
    f"RMSE: {rmse}",
    f"Number of Features (After Feature Selection): {k_best}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- Naive Bayes Regresyon + n-gram + Feature Selection Sonuçları ---
MAE: 2.4296199701937407
RMSE: 2.7341945198049
Number of Features (After Feature Selection): 1000
Training Time: 1.5466785430908203 seconds
Testing Time: 1.113072395324707 seconds


In [158]:
# Fit a decision tree regressor (Friedman MSE split criterion, seeded with
# random_state=42) on the feature-selected n-gram matrix.
# The fit is timed with the monotonic time.perf_counter() clock.
dt_regressor = DecisionTreeRegressor(criterion='friedman_mse', random_state=42)
start_time = time.perf_counter()
dt_regressor.fit(X_train_selected, y_train)
training_time = time.perf_counter() - start_time

In [159]:
# Predict with the decision tree on the feature-selected n-gram test matrix.
# The high-resolution time.perf_counter() clock is used because this call
# takes only a fraction of a second.
start_time = time.perf_counter()
y_pred = dt_regressor.predict(X_test_selected)
testing_time = time.perf_counter() - start_time

In [160]:
# Score the decision tree predictions (feature-selected n-grams) with MAE and
# RMSE. sqrt(MSE) replaces `squared=False`, which scikit-learn deprecated in
# 1.4 and removed in 1.6.
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

In [161]:
# Report metrics, the configured number of selected features and timings for
# the decision tree on the feature-selected n-grams, as one newline-separated
# print call. The Turkish heading text is kept verbatim.
print(
    "\n--- Decision Tree Regresyon + n-gram + Feature Selection Sonuçları ---",
    f"MAE: {mae}",
    f"RMSE: {rmse}",
    f"Number of Features (After Feature Selection): {k_best}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- Decision Tree Regresyon + n-gram + Feature Selection Sonuçları ---
MAE: 0.8094795637978834
RMSE: 1.0724838098316916
Number of Features (After Feature Selection): 1000
Training Time: 41.1659722328186 seconds
Testing Time: 0.02485203742980957 seconds


In [162]:
# Reduce the n-gram count features to 1000 principal components.
# PCA is fitted on the training split only and then applied to the test split.
# random_state is pinned because PCA's default 'auto' solver can select the
# randomized SVD for inputs of this size, which would otherwise produce
# slightly different components (and downstream metrics) on every run.
# NOTE: .toarray() materialises the sparse matrix as a dense array in memory.
# This cell rebinds X_train_pca / X_test_pca, replacing any earlier projection.
n_components = 1000
pca = PCA(n_components=n_components, random_state=42)
X_train_pca = pca.fit_transform(X_train_ngram.toarray())
X_test_pca = pca.transform(X_test_ngram.toarray())

In [163]:
# Fit a 5-nearest-neighbour regressor on the PCA-reduced n-gram features.
# The fit is timed with time.perf_counter(), a monotonic high-resolution
# clock suited to sub-second measurements.
knn_regressor = KNeighborsRegressor(n_neighbors=5)
start_time = time.perf_counter()
knn_regressor.fit(X_train_pca, y_train)
training_time = time.perf_counter() - start_time

In [164]:
# Predict with KNN on the PCA-reduced n-gram test features.
# The call is timed with the monotonic time.perf_counter() clock.
start_time = time.perf_counter()
y_pred = knn_regressor.predict(X_test_pca)
testing_time = time.perf_counter() - start_time

In [165]:
# Score the KNN predictions (PCA-reduced n-grams) with MAE and RMSE.
# RMSE is sqrt(MSE); the `squared=False` flag was deprecated in scikit-learn
# 1.4 and removed in 1.6.
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

In [166]:
# Report metrics, component count and timings for KNN on the PCA-reduced
# n-gram features, as one newline-separated print call.
# The Turkish heading text is kept verbatim.
print(
    "\n--- KNN Regresyon + n-gram + PCA Sonuçları ---",
    f"MAE: {mae}",
    f"RMSE: {rmse}",
    f"Number of Features (After PCA): {n_components}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- KNN Regresyon + n-gram + PCA Sonuçları ---
MAE: 0.7189046199701936
RMSE: 0.8912415359541072
Number of Features (After PCA): 1000
Training Time: 0.29863476753234863 seconds
Testing Time: 18.194344758987427 seconds


In [167]:
# Fit Gaussian Naive Bayes on the PCA-reduced n-gram features.
# NOTE: scikit-learn's GaussianNB is a classifier, so the target values are
# treated as discrete class labels despite the "regressor" name.
# Timing uses the monotonic, high-resolution time.perf_counter() clock.
nb_regressor = GaussianNB()
start_time = time.perf_counter()
nb_regressor.fit(X_train_pca, y_train)
training_time = time.perf_counter() - start_time

In [168]:
# Predict with Naive Bayes on the PCA-reduced n-gram test features.
# The call is timed with the monotonic time.perf_counter() clock.
start_time = time.perf_counter()
y_pred = nb_regressor.predict(X_test_pca)
testing_time = time.perf_counter() - start_time

In [169]:
# Score the Naive Bayes predictions (PCA-reduced n-grams) with MAE and RMSE.
# sqrt(MSE) is used because `squared=False` was deprecated in scikit-learn 1.4
# and removed in 1.6.
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

In [170]:
# Report metrics, component count and timings for Naive Bayes on the
# PCA-reduced n-gram features, as one newline-separated print call.
# The Turkish heading text is kept verbatim.
print(
    "\n--- Naive Bayes Regresyon + n-gram + PCA Sonuçları ---",
    f"MAE: {mae}",
    f"RMSE: {rmse}",
    f"Number of Features (After PCA): {n_components}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- Naive Bayes Regresyon + n-gram + PCA Sonuçları ---
MAE: 0.9554769001490313
RMSE: 1.3320954884924718
Number of Features (After PCA): 1000
Training Time: 1.317770004272461 seconds
Testing Time: 0.9919209480285645 seconds


In [171]:
# Fit a decision tree regressor (Friedman MSE split criterion, seeded with
# random_state=42) on the PCA-reduced n-gram features.
# The fit is timed with time.perf_counter(), which is monotonic and therefore
# safe for long runs during which the system clock might be adjusted.
dt_regressor = DecisionTreeRegressor(criterion='friedman_mse', random_state=42)
start_time = time.perf_counter()
dt_regressor.fit(X_train_pca, y_train)
training_time = time.perf_counter() - start_time

In [172]:
# Predict with the decision tree on the PCA-reduced n-gram test features.
# The call is timed with the monotonic, high-resolution time.perf_counter()
# clock.
start_time = time.perf_counter()
y_pred = dt_regressor.predict(X_test_pca)
testing_time = time.perf_counter() - start_time

In [173]:
# Score the decision tree predictions (PCA-reduced n-grams) with MAE and RMSE.
# RMSE is computed as sqrt(MSE) since `squared=False` was deprecated in
# scikit-learn 1.4 and removed in 1.6.
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

In [174]:
# Report metrics, component count and timings for the decision tree on the
# PCA-reduced n-gram features, as one newline-separated print call.
# The Turkish heading text is kept verbatim.
print(
    "\n--- Decision Tree Regresyon + n-gram + PCA Sonuçları ---",
    f"MAE: {mae}",
    f"RMSE: {rmse}",
    f"Number of Features (After PCA): {n_components}",
    f"Training Time: {training_time} seconds",
    f"Testing Time: {testing_time} seconds",
    sep="\n",
)


--- Decision Tree Regresyon + n-gram + PCA Sonuçları ---
MAE: 0.8581017897316311
RMSE: 1.1536653861862698
Number of Features (After PCA): 1000
Training Time: 1193.5766344070435 seconds
Testing Time: 0.10904955863952637 seconds
