Random Forest: classifying fake-news truthfulness labels from TF-IDF text features

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib


In [24]:
# Load the raw dataset; the path is resolved relative to the working directory.
data = pd.read_csv('Fake_news_content_detection.csv')

# Collapse the per-label columns into one categorical target: each row gets the
# name of the column holding its largest value. When several columns tie for
# the maximum, idxmax keeps the first one in the order listed here.
target_columns = ['Barely-True', 'False', 'Half-True', 'Mostly-True', 'Not-Known', 'True']
label_columns = data[target_columns]
data['Target'] = label_columns.idxmax(axis='columns')

In [25]:

# Discard rows whose 'Text' value is missing
data = data.dropna(subset=['Text'])

# Features are the raw text; labels are the derived 'Target' column
X, y = data['Text'], data['Target']

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# TF-IDF features: English stop words are removed and terms appearing in more
# than 70% of documents are ignored. The vocabulary is learned from the
# training split only and then reused, unchanged, for the test split.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [26]:

# Train Random Forest Classifier
# Hyperparameters are fixed here. n_jobs=-1 builds and queries the trees on all
# available cores (the same setting the grid search in this notebook uses);
# with random_state fixed, parallelism is expected to change only the run time,
# not the fitted forest.
model = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=1,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1,
)
model.fit(X_train_vec, y_train)

# Make Predictions
y_pred = model.predict(X_test_vec)

In [None]:
# Optional hyperparameter search.
# Disabled by default because it is expensive: 72 parameter combinations x 5
# folds = 360 forest fits. Set RUN_GRID_SEARCH = True to rerun it; when enabled
# it REPLACES `model` and `y_pred` from the training cell with the best
# estimator found and its test-set predictions.
RUN_GRID_SEARCH = False

if RUN_GRID_SEARCH:
    # Set up the parameter grid for GridSearchCV
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'max_features': ['sqrt', 'log2']
    }

    # Create the base model for GridSearchCV
    rf = RandomForestClassifier(random_state=42)

    # Use GridSearchCV to find the best parameters
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                               cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
    grid_search.fit(X_train_vec, y_train)

    # Print the best parameters
    print("Best parameters found: ", grid_search.best_params_)

    # Use the best model to make predictions
    model = grid_search.best_estimator_
    y_pred = model.predict(X_test_vec)

In [28]:

# Evaluate the Model
# Compute both metrics first, then print: overall accuracy followed by the
# per-class precision / recall / F1 table.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print("Classification Report:\n", report)


Accuracy: 0.26
Classification Report:
               precision    recall  f1-score   support

 Barely-True       0.30      0.15      0.19       331
       False       0.23      0.42      0.30       399
   Half-True       0.25      0.26      0.26       423
 Mostly-True       0.26      0.34      0.30       392
   Not-Known       0.55      0.07      0.12       168
        True       0.29      0.18      0.22       335

    accuracy                           0.26      2048
   macro avg       0.31      0.23      0.23      2048
weighted avg       0.29      0.26      0.25      2048



In [29]:
# Display Sample Predictions
# Reset the row labels so actual labels, predictions and text line up
# positionally when combined into one frame.
sample_data = X_test.reset_index(drop=True)
predictions_df = pd.DataFrame({
    'Actual': y_test.reset_index(drop=True),
    'Predicted': y_pred,
    'Text': sample_data,
})

# Fixed seed so the same rows are shown on every run; the size is clamped so a
# test set smaller than 20 rows does not raise.
n_preview = min(20, len(predictions_df))
print(predictions_df.sample(n_preview, random_state=42))


           Actual    Predicted  \
1829        False  Barely-True   
713         False  Mostly-True   
52          False        False   
1582    Half-True  Mostly-True   
856   Mostly-True  Mostly-True   
782          True        False   
1633    Half-True        False   
1432  Barely-True         True   
1690  Mostly-True  Mostly-True   
1341  Mostly-True  Mostly-True   
1811  Mostly-True  Barely-True   
1172        False  Barely-True   
706         False        False   
815          True         True   
453   Mostly-True        False   
1846        False    Half-True   
1227    Not-Known        False   
1613    Half-True    Half-True   
1450    Not-Known        False   
736   Mostly-True    Half-True   

                                                   Text  
1829  Obamacare means that for up to 20 million Amer...  
713   I dont think the argument can be credibly made...  
52    Says government is a barrier to innovation and...  
1582  Today, when people retire in Galveston County,.

In [30]:
# Save the model and vectorizer
# Persistence is currently disabled. Uncomment the two lines below to write the
# fitted classifier and the fitted TF-IDF vectorizer to the working directory;
# both artifacts are needed together to score new text later.
#joblib.dump(model, 'random_forest_model.joblib')
#joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')
