# Set up the envinronment

In [None]:
pip install pandas scikit-learn

: 

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Read and prepare the dataset

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import copy

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

data_folder = '/content/drive/MyDrive/Colab Notebooks/Machine Learning/Fake News prediction/Datasets'

TrueNews_data = 'True.csv'
FakeNews_data = 'Fake.csv'

TrueNews_data_path = os.path.join(data_folder, TrueNews_data)
FakeNews_data_path = os.path.join(data_folder, FakeNews_data)

TNdf = pd.read_csv(TrueNews_data_path)
TNdf['label'] = 0
FNdf = pd.read_csv(FakeNews_data_path)
FNdf['label'] = 1

merged__FNdf__TNdf = pd.merge(FNdf, TNdf, how='outer')
merged__FNdf__TNdf = merged__FNdf__TNdf.sample(frac=1).reset_index(drop=True)
merged__FNdf__TNdf['context'] = merged__FNdf__TNdf['title'] +  ". " + merged__FNdf__TNdf['text']

X_train, X_test, y_train, y_test = train_test_split(merged__FNdf__TNdf['context'], merged__FNdf__TNdf['label'], test_size=0.3, random_state=42)

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print(f"Shape of X_train_tfidf: {X_train_tfidf.shape}")
print(f"Shape of X_test_tfidf: {X_test_tfidf.shape}")


Shape of X_train_tfidf: (31428, 5000)
Shape of X_test_tfidf: (13470, 5000)


# Build the models

## Decision Tree

In [5]:
dt = DecisionTreeClassifier()
dt.fit(X_train_tfidf, y_train)
y_pred_dt = dt.predict(X_test_tfidf)

print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))

Decision Tree Accuracy: 0.9951744617668894
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      6388
           1       1.00      1.00      1.00      7082

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



## Random Forest

In [None]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train_tfidf, y_train)
y_pred_rf = rf.predict(X_test_tfidf)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.9968819599109131
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6442
           1       1.00      1.00      1.00      7028

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



## Gradient Boosting


In [None]:
gb = GradientBoostingClassifier(n_estimators=100)
gb.fit(X_train_tfidf, y_train)
y_pred_gb = gb.predict(X_test_tfidf)

print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred_gb))
print(classification_report(y_test, y_pred_gb))

Gradient Boosting Accuracy: 0.9955456570155902
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      6442
           1       1.00      0.99      1.00      7028

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



## Logistic Regression

In [None]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_tfidf, y_train)
y_pred_lr = lr.predict(X_test_tfidf)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.988641425389755
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      6442
           1       0.99      0.99      0.99      7028

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470



# Evaluate and improve models

## Feature Engineering

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

### Decision Tree


In [9]:
dt = DecisionTreeClassifier()
dt.fit(X_train_tfidf, y_train)
y_pred_dt = dt.predict(X_test_tfidf)

print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))

Decision Tree Accuracy: 0.996362286562732
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6388
           1       1.00      1.00      1.00      7082

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



### Random Forest

In [10]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train_tfidf, y_train)
y_pred_rf = rf.predict(X_test_tfidf)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.996807720861173
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6388
           1       1.00      1.00      1.00      7082

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



### Gradient Boosting

In [12]:
gb = GradientBoostingClassifier(n_estimators=100)
gb.fit(X_train_tfidf, y_train)
y_pred_gb = gb.predict(X_test_tfidf)

print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred_gb))
print(classification_report(y_test, y_pred_gb))

Gradient Boosting Accuracy: 0.9965107646622123
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      6388
           1       1.00      1.00      1.00      7082

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



### Logistic Regression

In [13]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_tfidf, y_train)
y_pred_lr = lr.predict(X_test_tfidf)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.9893095768374165
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      6388
           1       0.99      0.99      0.99      7082

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470



## Hyperparameter Tuning

### Decision tree

In [None]:
from sklearn.model_selection import GridSearchCV

dt_params = {
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search_dt = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=dt_params, cv=5, n_jobs=-1, verbose=2)
grid_search_dt.fit(X_train_tfidf, y_train)
best_dt = grid_search_dt.best_estimator_

y_pred_best_dt = best_dt.predict(X_test_tfidf)
print("Optimized Decision Tree Accuracy:", accuracy_score(y_test, y_pred_best_dt))
print(classification_report(y_test, y_pred_best_dt))


Fitting 5 folds for each of 36 candidates, totalling 180 fits


KeyboardInterrupt: 

### Random forest

In [None]:
from sklearn.model_selection import RandomizedSearchCV

rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

random_search_rf = RandomizedSearchCV(estimator=RandomForestClassifier(), param_distributions=rf_params, n_iter=50, cv=5, n_jobs=-1, verbose=2, random_state=42)
random_search_rf.fit(X_train_tfidf, y_train)
best_rf = random_search_rf.best_estimator_

y_pred_best_rf = best_rf.predict(X_test_tfidf)
print("Optimized Random Forest Accuracy:", accuracy_score(y_test, y_pred_best_rf))
print(classification_report(y_test, y_pred_best_rf))


### Gradient Boosting

In [None]:
gb_params = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

grid_search_gb = GridSearchCV(estimator=GradientBoostingClassifier(), param_grid=gb_params, cv=5, n_jobs=-1, verbose=2)
grid_search_gb.fit(X_train_tfidf, y_train)
best_gb = grid_search_gb.best_estimator_

y_pred_best_gb = best_gb.predict(X_test_tfidf)
print("Optimized Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred_best_gb))
print(classification_report(y_test, y_pred_best_gb))

### Logistic Regression

In [None]:
lr_params = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear']
}

grid_search_lr = GridSearchCV(estimator=LogisticRegression(max_iter=1000), param_grid=lr_params, cv=5, n_jobs=-1, verbose=2)
grid_search_lr.fit(X_train_tfidf, y_train)
best_lr = grid_search_lr.best_estimator_

y_pred_best_lr = best_lr.predict(X_test_tfidf)
print("Optimized Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_best_lr))
print(classification_report(y_test, y_pred_best_lr))

NameError: name 'GridSearchCV' is not defined

### Decision Tree


# Save the models

In [None]:
import joblib

joblib.dump(dt, 'decision_tree_model.pkl')
joblib.dump(rf, 'random_forest_model.pkl')
joblib.dump(gb, 'gradient_boosting_model.pkl')
joblib.dump(lr, 'logistic_regression_model.pkl')

['logistic_regression_model.pkl']