# Random Forest (RF) classification

In [1]:
import ast

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
# Location of the feature CSV.
# NOTE(review): this is an absolute, machine-specific path; point it at the
# local copy of the file before running the notebook.
DATA_PATH = "C:/Yee Ann/NUS/DSA4266/TruthSeeker2023/Features_For_Traditional_ML_Techniques.csv"

# Hand the path straight to pandas with an explicit encoding. A bare open()
# decodes with the platform's locale encoding, so the text columns could be
# decoded differently from one machine to the next.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm.
df = pd.read_csv(DATA_PATH, encoding="utf-8")

In [3]:
# Clean-up in one chain: discard the 'Unnamed: 0' column, keep only the columns
# that hold more than one unique value, then remove exact duplicate rows
df = (
    df.drop(columns=['Unnamed: 0'])
      .loc[:, lambda frame: frame.nunique() > 1]
      .drop_duplicates()
)

# Cast the target column to integers (intended as a boolean -> 0/1 conversion)
df['majority_target'] = df['majority_target'].astype(int)

Convert Embeddings into numerical columns

In [4]:
def replace_spaces_with_commas(emb_str):
    """Return `emb_str` with every single space character turned into a comma.

    Consecutive spaces yield consecutive commas; no other whitespace is touched.
    """
    pieces = emb_str.split(' ')
    return ','.join(pieces)

# Rewrite each embedding string element-wise so its values are comma-separated
df['embeddings'] = df['embeddings'].map(replace_spaces_with_commas)

# Function to extract the values from the string representation
def extract_list(emb_str):
    """Parse a bracketed, comma-separated string of literals into Python values.

    Leading/trailing '[' and ']' characters are stripped and the remainder is
    parsed with `ast.literal_eval`, so "[0.1,-0.2]" becomes the tuple
    (0.1, -0.2). A string holding a single value yields that bare value.

    `ast.literal_eval` only accepts Python literals. Unlike `eval`, it cannot
    execute code embedded in the data file.

    Raises:
        ValueError / SyntaxError: if the text is not a valid literal expression.
    """
    return ast.literal_eval(emb_str.strip("[]"))

# Parse every embedding string once, then build the numeric frame with a single
# constructor call. (`.apply(pd.Series)` would create one Series object per
# row, which is very slow on a large frame.) Reusing df's index keeps the rows
# aligned for the column-wise concat below.
parsed_embeddings = df['embeddings'].apply(extract_list)
embeddings_split = pd.DataFrame(parsed_embeddings.tolist(), index=df.index)

# Rename columns to reflect the split embedding positions
embeddings_split.columns = [f'embedding_{i}' for i in range(embeddings_split.shape[1])]

# Swap the raw string column for the expanded numeric columns
df = pd.concat([df.drop(columns=['embeddings']), embeddings_split], axis=1)

Drop highly correlated features

In [5]:
# Partition the columns by dtype; the target is left out of the numeric set
num_col = df.select_dtypes(['int64', 'int32', 'float64']).drop(columns=['majority_target'])
obj_col = df.select_dtypes('object')

# Absolute pairwise correlations, restricted to the strict upper triangle so
# that every pair of columns is inspected exactly once
corr_matrix = num_col.corr().abs()
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# A numerical column is dropped when its correlation with any earlier column
# exceeds 0.75
to_drop = upper.columns[(upper > 0.75).any(axis=0)].tolist()
df = df.drop(columns=to_drop)

Baseline Random Forest (all remaining features)

In [6]:
# Label, and features = every column except the raw text and the label itself.
# NOTE(review): 'BinaryNumTarget' stays in X (it appears among the RFE-selected
# features further down); if that column encodes the label this is target
# leakage -- confirm.
y = df['majority_target']
X = df.drop(columns=['statement', 'tweet', 'majority_target'])

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Define and train the forest in one step (fit() returns the fitted estimator)
rf = RandomForestClassifier(
    n_estimators=100, max_depth=10, random_state=42
).fit(X_train, y_train)

# Predictions on both partitions
y_pred_train = rf.predict(X_train)
y_pred = rf.predict(X_test)

In [7]:
# Overall accuracy on the held-out test set
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1 for the training and the test partition
class_labels = ['Fake', 'Real']
train_report = classification_report(y_train, y_pred_train, target_names=class_labels)
test_report = classification_report(y_test, y_pred, target_names=class_labels)

print("\nClassification Report: training set")
print(train_report)

print("\nClassification Report:")
print(test_report)

# Confusion matrix of the test-set predictions
test_confusion = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(test_confusion)

Random Forest Accuracy: 0.9576

Classification Report: training set
              precision    recall  f1-score   support

        Fake       0.96      0.96      0.96     45771
        Real       0.96      0.96      0.96     48167

    accuracy                           0.96     93938
   macro avg       0.96      0.96      0.96     93938
weighted avg       0.96      0.96      0.96     93938


Classification Report:
              precision    recall  f1-score   support

        Fake       0.95      0.96      0.96     19442
        Real       0.96      0.96      0.96     20818

    accuracy                           0.96     40260
   macro avg       0.96      0.96      0.96     40260
weighted avg       0.96      0.96      0.96     40260


Confusion Matrix:
[[18619   823]
 [  883 19935]]


Recursive Feature Elimination (RFE): select the top n features and use them as X

In [8]:
# Recursive Feature Elimination driven by the forest: keep the top 20 features
rfe = RFE(estimator=rf, n_features_to_select=20).fit(X_train, y_train)

# Read the surviving column names off the boolean support mask
selected_features = X.columns[rfe.get_support()]
print("Selected Features: ", selected_features)

Selected Features:  Index(['BinaryNumTarget', 'followers_count', 'friends_count',
       'favourites_count', 'statuses_count', 'cred', 'normalize_influence',
       'hashtags', 'URLs', 'unique_count', 'PERSON_percentage',
       'MONEY_percentage', 'CARDINAL_percentage', 'Word count',
       'Average word length', 'present_verbs', 'past_verbs', 'pronouns',
       'capitals', 'embedding_10'],
      dtype='object')


In [9]:
# Same label as before; features restricted to the columns chosen by RFE
y2 = df['majority_target']
X2 = df.drop(columns=['majority_target', 'statement', 'tweet'])[selected_features]

# 70/30 train/test split with a fixed seed
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.3, random_state=42
)

# Define and train the forest in one step (fit() returns the fitted estimator)
rf2 = RandomForestClassifier(
    n_estimators=100, max_depth=10, random_state=42
).fit(X2_train, y2_train)

# Predictions on both partitions
y2_pred_train = rf2.predict(X2_train)
y2_pred = rf2.predict(X2_test)

In [10]:
# Overall accuracy on the held-out test set
accuracy = accuracy_score(y2_test, y2_pred)
print(f"Random Forest Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1 for the training and the test partition
class_labels = ['Fake', 'Real']
train_report = classification_report(y2_train, y2_pred_train, target_names=class_labels)
test_report = classification_report(y2_test, y2_pred, target_names=class_labels)

print("\nClassification Report: training set")
print(train_report)

print("\nClassification Report:")
print(test_report)

# Confusion matrix of the test-set predictions
test_confusion = confusion_matrix(y2_test, y2_pred)
print("\nConfusion Matrix:")
print(test_confusion)

Random Forest Accuracy: 0.9576

Classification Report: training set
              precision    recall  f1-score   support

        Fake       0.96      0.96      0.96     45771
        Real       0.96      0.96      0.96     48167

    accuracy                           0.96     93938
   macro avg       0.96      0.96      0.96     93938
weighted avg       0.96      0.96      0.96     93938


Classification Report:
              precision    recall  f1-score   support

        Fake       0.95      0.96      0.96     19442
        Real       0.96      0.96      0.96     20818

    accuracy                           0.96     40260
   macro avg       0.96      0.96      0.96     40260
weighted avg       0.96      0.96      0.96     40260


Confusion Matrix:
[[18619   823]
 [  883 19935]]


TF-IDF

In [11]:
# One vectorizer per text column. Re-fitting a single vectorizer on 'tweet'
# overwrites the vocabulary learned from 'statement', so the statement columns
# would end up labelled with tweet terms.
statement_vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
tweet_vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
# Backward-compatible alias: this name used to end up fitted on 'tweet'
tfidf_vectorizer = tweet_vectorizer

# NOTE(review): both vectorizers are fitted on the full data set, i.e. before
# the train/test split, so test-set term statistics influence the features.
statement_tfidf = statement_vectorizer.fit_transform(df['statement'])
tweet_tfidf = tweet_vectorizer.fit_transform(df['tweet'])

# Convert the TF-IDF matrices to DataFrames that
#   * reuse df's index, so the column-wise concat below aligns row by row
#     (df keeps its original labels after drop_duplicates), and
#   * carry a per-source prefix, so term columns clash neither with each other
#     nor with existing feature columns.
statement_tfidf_df = pd.DataFrame(
    statement_tfidf.toarray(),
    index=df.index,
    columns=[f'statement_tfidf_{term}' for term in statement_vectorizer.get_feature_names_out()],
)
tweet_tfidf_df = pd.DataFrame(
    tweet_tfidf.toarray(),
    index=df.index,
    columns=[f'tweet_tfidf_{term}' for term in tweet_vectorizer.get_feature_names_out()],
)

# Append the TF-IDF columns to the rest of the data
df = pd.concat([df, statement_tfidf_df, tweet_tfidf_df], axis=1)

# Drop the raw text columns; the remaining frame feeds the Random Forest model
df_processed_text = df.drop(['statement', 'tweet'], axis=1)

In [12]:
# Label, and features = every column of the text-processed frame except the label
y1 = df_processed_text['majority_target']
X1 = df_processed_text.drop(columns=['majority_target'])

# 70/30 train/test split with a fixed seed
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, test_size=0.3, random_state=42
)

# Define and train the forest in one step (fit() returns the fitted estimator)
rf1 = RandomForestClassifier(
    n_estimators=100, max_depth=10, random_state=42
).fit(X1_train, y1_train)

# Predictions on both partitions
y1_pred_train = rf1.predict(X1_train)
y1_pred = rf1.predict(X1_test)

In [13]:
# Overall accuracy on the held-out test set
accuracy1 = accuracy_score(y1_test, y1_pred)
print(f"Random Forest Accuracy: {accuracy1:.4f}")

# Per-class precision / recall / F1 for the training and the test partition
class_labels = ['Fake', 'Real']
train_report = classification_report(y1_train, y1_pred_train, target_names=class_labels)
test_report = classification_report(y1_test, y1_pred, target_names=class_labels)

print("\nClassification Report: training set")
print(train_report)

print("\nClassification Report:")
print(test_report)

# Confusion matrix of the test-set predictions
test_confusion = confusion_matrix(y1_test, y1_pred)
print("\nConfusion Matrix:")
print(test_confusion)

Random Forest Accuracy: 0.9576

Classification Report: training set
              precision    recall  f1-score   support

        Fake       0.96      0.96      0.96     45771
        Real       0.96      0.96      0.96     48167

    accuracy                           0.96     93938
   macro avg       0.96      0.96      0.96     93938
weighted avg       0.96      0.96      0.96     93938


Classification Report:
              precision    recall  f1-score   support

        Fake       0.95      0.96      0.96     19442
        Real       0.96      0.96      0.96     20818

    accuracy                           0.96     40260
   macro avg       0.96      0.96      0.96     40260
weighted avg       0.96      0.96      0.96     40260


Confusion Matrix:
[[18618   824]
 [  882 19936]]


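The core features in the dataset contain enough signal for the Random Forest model to reach high accuracy without extra feature engineering or dimensionality reduction techniques: the test accuracy is 0.9576 for the baseline model, for the model restricted to the 20 RFE-selected features, and for the model with added TF-IDF text features. Caveat: results this similar suggest that a small number of features dominate the predictions. In particular, `BinaryNumTarget`, which is among the RFE-selected features, should be checked for target leakage before this conclusion is relied on.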