In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report, accuracy_score
from scipy.sparse import hstack, csr_matrix

# Load the dataset
df = pd.read_csv('Features_For_Traditional_ML_Techniques.csv')

# Extract the tweet text and labels
tweets = df['tweet'].astype(str).values  # Using 'tweet' as the text input
labels = df['majority_target'].values  # Assuming this is the target label

# Create a TF-IDF vectorizer with a limited vocabulary size
vectorizer = TfidfVectorizer(max_features=5000)  # Limit to top 5000 words
tfidf_features = vectorizer.fit_transform(tweets)  # Keep this as a sparse matrix

# Apply TruncatedSVD for dimensionality reduction on TF-IDF features
svd = TruncatedSVD(n_components=100, random_state=42)  # Reduce to 100 components
tfidf_reduced = svd.fit_transform(tfidf_features)

# Combine TF-IDF reduced features with the rest of your features
extra_features = df.drop(columns=['tweet', 'statement', 'majority_target', 'BinaryNumTarget'])  # Drop non-feature columns

# Ensure all extra features are numeric, convert or fill NaNs
extra_features = extra_features.apply(pd.to_numeric, errors='coerce')  # Convert to numeric, coercing errors to NaN
extra_features.fillna(0, inplace=True)  # Fill NaN values with 0 (or another strategy)

# Convert extra features to a sparse matrix
extra_features_sparse = csr_matrix(extra_features.values)

# Combine the extra sparse features with the reduced TF-IDF features
combined_features = hstack([extra_features_sparse, csr_matrix(tfidf_reduced)])  # Combine both as sparse matrices

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(combined_features, labels, test_size=0.3, random_state=42)

# Create and train the Random Forest Classifier with optimizations
rf_model = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=42)  # 50 trees and use all CPU cores
rf_model.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = rf_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9407352210630899
Classification Report:
               precision    recall  f1-score   support

       False       0.94      0.93      0.94     19442
        True       0.94      0.95      0.94     20818

    accuracy                           0.94     40260
   macro avg       0.94      0.94      0.94     40260
weighted avg       0.94      0.94      0.94     40260



In [21]:
from sklearn.preprocessing import StandardScaler

from scipy.sparse import hstack, csr_matrix# Load the dataset
df = pd.read_csv('Features_For_Traditional_ML_Techniques.csv')
# Initialize the lemmatizer and stopwords
df = df.drop('Unnamed: 0', axis=1) #dropping this column

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to preprocess a single tweet
def preprocess_tweet(tweet):
    # 1. Convert to lowercase
    tweet = tweet.lower()
    
    # 2. Remove URLs, mentions (@username), hashtags, and special characters
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet)  # Remove URLs
    tweet = re.sub(r'@\w+|\#', '', tweet)  # Remove mentions and hashtags
    tweet = re.sub(r'[^\w\s]', '', tweet)  # Remove punctuation and special characters
    
    # 3. Tokenization (split the tweet into words)
    words = tweet.split()
    
    # 4. Remove stopwords and apply lemmatization
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    
    # Join the words back into a single string
    return ' '.join(words)

# Apply the preprocessing to the 'tweet' column
df['processed_tweet'] = df['tweet'].apply(preprocess_tweet)



def replace_spaces_with_commas(emb_str):
    return emb_str.replace(' ', ',')

# Apply the function to the embeddings column
df['embeddings'] = df['embeddings'].apply(replace_spaces_with_commas)

# Function to extract the list from the string representation
def extract_list(emb_str):
    # Remove outer brackets and convert the string to a list
    return eval(emb_str.strip("[]"))

# Apply the function and create a DataFrame from the list of embeddings
embeddings_split = df['embeddings'].apply(extract_list).apply(pd.Series)

# Rename columns to reflect the split embedding positions
embeddings_split.columns = [f'embedding_{i}' for i in range(embeddings_split.shape[1])]

# Concatenate the original DataFrame with the new embeddings DataFrame
df = pd.concat([df, embeddings_split], axis=1)
df = df.drop(['embeddings'], axis=1)



# Extract the tweet text and labels
tweets = df['processed_tweet'].astype(str).values  # Using 'tweet' as the text input
y = df['majority_target']

# Create a TF-IDF vectorizer with a limited vocabulary size
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_features = vectorizer.fit_transform(tweets)

# Apply TruncatedSVD for dimensionality reduction on TF-IDF features
svd = TruncatedSVD(n_components=100, random_state=42)
tfidf_reduced = svd.fit_transform(tfidf_features)

# Now drop the 'majority_target', 'statement', and 'tweet' columns from df
X_other = df.drop(['majority_target', 'statement', 'tweet'], axis=1)

# Convert all non-numeric columns to numeric, or fill NaN with 0
X_other = X_other.apply(pd.to_numeric, errors='coerce').fillna(0)


# Scale the numerical features using StandardScaler
scaler = StandardScaler()
X_other_scaled = scaler.fit_transform(X_other)  # This will return a dense array




# Convert X_other to a sparse matrix since we will combine it with the sparse TF-IDF matrix
X_other_sparse = csr_matrix(X_other_scaled)

# Combine the TF-IDF reduced features and the other features
X_combined = hstack([X_other_sparse, csr_matrix(tfidf_reduced)])



# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size = 0.3, random_state=42)

# Define the model
rf = RandomForestClassifier(n_estimators=100, max_depth= 10, random_state=42)

# Train the Random Forest on the training data
rf.fit(X_train, y_train)

# Predict on the test set
y_pred = rf.predict(X_test)
y_pred_train = rf.predict(X_train)

# Evaluate the model using accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy:.4f}")

# Classification report
print("\nClassification Report: training set")
print(classification_report(y_train, y_pred_train, target_names=['Fake', 'Real']))

print("\nClassification Report: test set")
print(classification_report(y_test, y_pred, target_names=['Fake', 'Real']))

Random Forest Accuracy: 0.9576

Classification Report: training set
              precision    recall  f1-score   support

        Fake       0.96      0.96      0.96     45771
        Real       0.96      0.96      0.96     48167

    accuracy                           0.96     93938
   macro avg       0.96      0.96      0.96     93938
weighted avg       0.96      0.96      0.96     93938


Classification Report: test set
              precision    recall  f1-score   support

        Fake       0.95      0.96      0.96     19442
        Real       0.96      0.96      0.96     20818

    accuracy                           0.96     40260
   macro avg       0.96      0.96      0.96     40260
weighted avg       0.96      0.96      0.96     40260



In [26]:
from sklearn.preprocessing import StandardScaler

from scipy.sparse import hstack, csr_matrix# Load the dataset
df = pd.read_csv('Features_For_Traditional_ML_Techniques.csv')
# Initialize the lemmatizer and stopwords
df = df.drop('Unnamed: 0', axis=1) #dropping this column

# lemmatizer = WordNetLemmatizer()
# stop_words = set(stopwords.words('english'))

# # Function to preprocess a single tweet
# def preprocess_tweet(tweet):
#     # 1. Convert to lowercase
#     tweet = tweet.lower()
    
#     # 2. Remove URLs, mentions (@username), hashtags, and special characters
#     tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet)  # Remove URLs
#     tweet = re.sub(r'@\w+|\#', '', tweet)  # Remove mentions and hashtags
#     tweet = re.sub(r'[^\w\s]', '', tweet)  # Remove punctuation and special characters
    
#     # 3. Tokenization (split the tweet into words)
#     words = tweet.split()
    
#     # 4. Remove stopwords and apply lemmatization
#     words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    
#     # Join the words back into a single string
#     return ' '.join(words)

# # Apply the preprocessing to the 'tweet' column
# df['processed_tweet'] = df['tweet'].apply(preprocess_tweet)


# Extract the tweet text and labels
tweets = df['tweet'].astype(str).values  # Using 'tweet' as the text input
y = df['majority_target']

# Create a TF-IDF vectorizer with a limited vocabulary size
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_features = vectorizer.fit_transform(tweets)

# # Apply TruncatedSVD for dimensionality reduction on TF-IDF features
# svd = TruncatedSVD(n_components=100, random_state=42)
# tfidf_reduced = svd.fit_transform(tfidf_features)

# Combine the TF-IDF reduced features and the other features
X_combined = hstack([tfidf_features])



# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size = 0.3, random_state=42)

# Define the model
rf = RandomForestClassifier(n_estimators=100, max_depth= 10, random_state=42)

# Train the Random Forest on the training data
rf.fit(X_train, y_train)

# Predict on the test set
y_pred = rf.predict(X_test)
y_pred_train = rf.predict(X_train)

# Evaluate the model using accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy:.4f}")

# Classification report
print("\nClassification Report: training set")
print(classification_report(y_train, y_pred_train, target_names=['Fake', 'Real']))

print("\nClassification Report: test set")
print(classification_report(y_test, y_pred, target_names=['Fake', 'Real']))

Random Forest Accuracy: 0.8160

Classification Report: training set
              precision    recall  f1-score   support

        Fake       0.91      0.70      0.79     45771
        Real       0.77      0.93      0.84     48167

    accuracy                           0.82     93938
   macro avg       0.84      0.82      0.82     93938
weighted avg       0.84      0.82      0.82     93938


Classification Report: test set
              precision    recall  f1-score   support

        Fake       0.90      0.70      0.79     19442
        Real       0.77      0.93      0.84     20818

    accuracy                           0.82     40260
   macro avg       0.83      0.81      0.81     40260
weighted avg       0.83      0.82      0.81     40260



In [29]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
df = pd.read_csv('Features_For_Traditional_ML_Techniques.csv')

# Drop the unnecessary column
df = df.drop('Unnamed: 0', axis=1)

# Function to replace spaces with commas in the 'embeddings' column
def replace_spaces_with_commas(emb_str):
    return emb_str.replace(' ', ',')

# Apply the function to the embeddings column
df['embeddings'] = df['embeddings'].apply(replace_spaces_with_commas)

# Function to extract list from the string representation of embeddings
def extract_list(emb_str):
    return eval(emb_str.strip("[]"))

# Apply the function and create a DataFrame from the list of embeddings
embeddings_split = df['embeddings'].apply(extract_list).apply(pd.Series)

# Rename columns to reflect the split embedding positions
embeddings_split.columns = [f'embedding_{i}' for i in range(embeddings_split.shape[1])]

# Concatenate the original DataFrame with the new embeddings DataFrame
df = pd.concat([df, embeddings_split], axis=1)

# Drop the original embeddings column
df = df.drop(['embeddings'], axis=1)

# Define features (X) and target (y)
y = df['majority_target']
X = df.drop(['majority_target', 'statement', 'tweet'], axis=1)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the Random Forest model
rf = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=42)

# Train the Random Forest on the training data
rf.fit(X_train, y_train)

# Predict on the test set
y_pred = rf.predict(X_test)
y_pred_train = rf.predict(X_train)

# Evaluate the model using accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy:.15f}")

# Classification report
print("\nClassification Report: training set")
print(classification_report(y_train, y_pred_train, target_names=['Fake', 'Real']))

print("\nClassification Report: test set")
print(classification_report(y_test, y_pred, target_names=['Fake', 'Real']))


Random Forest Accuracy: 0.957625434674615

Classification Report: training set
              precision    recall  f1-score   support

        Fake       0.96      0.96      0.96     45771
        Real       0.97      0.97      0.97     48167

    accuracy                           0.96     93938
   macro avg       0.96      0.96      0.96     93938
weighted avg       0.96      0.96      0.96     93938


Classification Report: test set
              precision    recall  f1-score   support

        Fake       0.95      0.96      0.96     19442
        Real       0.96      0.96      0.96     20818

    accuracy                           0.96     40260
   macro avg       0.96      0.96      0.96     40260
weighted avg       0.96      0.96      0.96     40260



In [27]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import randint

# Load the dataset
df = pd.read_csv('Features_For_Traditional_ML_Techniques.csv')

# Drop the unnecessary column
df = df.drop('Unnamed: 0', axis=1)

# Function to replace spaces with commas in the 'embeddings' column
def replace_spaces_with_commas(emb_str):
    return emb_str.replace(' ', ',')

# Apply the function to the embeddings column
df['embeddings'] = df['embeddings'].apply(replace_spaces_with_commas)

# Function to extract list from the string representation of embeddings
def extract_list(emb_str):
    return eval(emb_str.strip("[]"))

# Apply the function and create a DataFrame from the list of embeddings
embeddings_split = df['embeddings'].apply(extract_list).apply(pd.Series)

# Rename columns to reflect the split embedding positions
embeddings_split.columns = [f'embedding_{i}' for i in range(embeddings_split.shape[1])]

# Concatenate the original DataFrame with the new embeddings DataFrame
df = pd.concat([df, embeddings_split], axis=1)

# Drop the original embeddings column
df = df.drop(['embeddings'], axis=1)

# Define features (X) and target (y)
y = df['majority_target']
X = df.drop(['majority_target', 'statement', 'tweet'], axis=1)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the Random Forest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)


# Define the parameter grid for RandomizedSearchCV using scipy's randint distribution
param_dist = {
    'n_estimators': randint(50, 200),  # Number of trees
    'max_depth': randint(10, 50),  # Maximum depth of the tree
    'min_samples_split': randint(2, 20),  # Minimum samples required to split a node
    'min_samples_leaf': randint(1, 20),  # Minimum samples required to be at a leaf node
    'bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
}


# Create a RandomForestClassifier instance
rf_model = RandomForestClassifier(random_state=42)

# Reduced n_iter and cv for quicker execution
random_search = RandomizedSearchCV(
    rf_model, param_distributions=param_dist, n_iter=6, cv=2, 
    verbose=2, random_state=42, n_jobs=-1
)


# Fit the model with RandomizedSearchCV to find the best parameters
random_search.fit(X_train, y_train)

# Get the best estimator and print the best parameters
best_rf_model = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)

# Make predictions using the best model
y_pred = best_rf_model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Fitting 2 folds for each of 6 candidates, totalling 12 fits
Best Parameters: {'bootstrap': True, 'max_depth': 38, 'min_samples_leaf': 15, 'min_samples_split': 12, 'n_estimators': 121}
Accuracy: 0.957625434674615
Classification Report:
               precision    recall  f1-score   support

       False       0.95      0.96      0.96     19442
        True       0.96      0.96      0.96     20818

    accuracy                           0.96     40260
   macro avg       0.96      0.96      0.96     40260
weighted avg       0.96      0.96      0.96     40260



In [30]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import randint, uniform

# Define the XGBoost model
xgb_model = XGBClassifier(random_state=42)

# Define the parameter grid for RandomizedSearchCV
param_dist = {
    'n_estimators': randint(50, 200),  # Number of trees
    'max_depth': randint(3, 10),  # Maximum depth of the tree
    'learning_rate': uniform(0.01, 0.3),  # Learning rate (eta)
    'subsample': uniform(0.6, 0.4),  # Fraction of samples to use per tree
    'colsample_bytree': uniform(0.6, 0.4),  # Fraction of features to use per tree
    'min_child_weight': randint(1, 10),  # Minimum sum of instance weight (Hessian)
    'gamma': uniform(0, 0.5),  # Minimum loss reduction required to make a further partition
}

# Create a RandomizedSearchCV instance
random_search_xgb = RandomizedSearchCV(
    xgb_model, param_distributions=param_dist, n_iter=20, cv=3,
    verbose=2, random_state=42, n_jobs=-1
)

# Fit the model with RandomizedSearchCV
random_search_xgb.fit(X_train, y_train)

# Get the best estimator and print the best parameters
best_xgb_model = random_search_xgb.best_estimator_
print("Best Parameters for XGBoost:", random_search_xgb.best_params_)

# Make predictions using the best model
y_pred_xgb = best_xgb_model.predict(X_test)

# Evaluate the model
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("XGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb))


Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters for XGBoost: {'colsample_bytree': 0.6399899663272012, 'gamma': 0.22962444598293358, 'learning_rate': 0.11011258334170654, 'max_depth': 5, 'min_child_weight': 6, 'n_estimators': 102, 'subsample': 0.9879639408647978}
XGBoost Accuracy: 0.957625434674615
XGBoost Classification Report:
               precision    recall  f1-score   support

       False       0.95      0.96      0.96     19442
        True       0.96      0.96      0.96     20818

    accuracy                           0.96     40260
   macro avg       0.96      0.96      0.96     40260
weighted avg       0.96      0.96      0.96     40260



In [31]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Define the base models
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_model = XGBClassifier(random_state=42)

# Define the meta-model
meta_model = LogisticRegression()

# Combine the base models and meta-model in a StackingClassifier
stacked_model = StackingClassifier(
    estimators=[('rf', rf_model), ('xgb', xgb_model)],
    final_estimator=meta_model,  # Meta-model
    cv=5,  # 5-fold cross-validation to fit meta-model
    n_jobs=-1
)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the stacked model
stacked_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = stacked_model.predict(X_test)

# Evaluate the model
print("Stacked Model Accuracy:", accuracy_score(y_test, y_pred))
print("Stacked Model Classification Report:\n", classification_report(y_test, y_pred))


Stacked Model Accuracy: 0.957625434674615
Stacked Model Classification Report:
               precision    recall  f1-score   support

       False       0.95      0.96      0.96     19442
        True       0.96      0.96      0.96     20818

    accuracy                           0.96     40260
   macro avg       0.96      0.96      0.96     40260
weighted avg       0.96      0.96      0.96     40260

