In [None]:
import io

from google.colab import files
import pandas as pd

# File name the review dataset is expected to have.
DATASET_FILENAME = 'Cleaned_Merged_Review_Dataset.csv'

uploaded = files.upload()

if uploaded:
    # Parse the bytes handed back by the upload widget instead of re-opening a
    # fixed path: when a file with the same name already exists, Colab stores
    # the new upload under a suffixed name (e.g. "... (2).csv"), so reading
    # DATASET_FILENAME from disk would silently load an older copy.
    upload_key = DATASET_FILENAME if DATASET_FILENAME in uploaded else next(iter(uploaded))
    combined_df = pd.read_csv(io.BytesIO(uploaded[upload_key]))
else:
    # Nothing was uploaded: fall back to a copy already present on disk.
    combined_df = pd.read_csv(DATASET_FILENAME)

print(combined_df.head())  # Shows the first 5 rows
# info() prints its summary itself and returns None, so wrapping it in print()
# only appends a stray "None" to the output.
combined_df.info()

Saving Cleaned_Merged_Review_Dataset.csv to Cleaned_Merged_Review_Dataset (2).csv
                                         review_text label
0  love well made sturdy comfortable love itvery ...  fake
1   love great upgrade original ive mine couple year  fake
2            pillow saved back love look feel pillow  fake
3        missing information use great product price  fake
4                nice set good quality set two month  fake
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41404 entries, 0 to 41403
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   review_text  41404 non-null  object
 1   label        41404 non-null  object
dtypes: object(2)
memory usage: 647.1+ KB
None


In [None]:
import joblib
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.svm import SVC

# Split the dataset into training and testing sets
X = combined_df['review_text']

# Encode labels: 1 for real, 0 for fake.
# An explicit lookup table turns any unexpected label into NaN instead of
# silently counting it as 'fake', so malformed rows are caught here rather
# than leaking into training.
LABEL_TO_ID = {'real': 1, 'fake': 0}
y = combined_df['label'].map(LABEL_TO_ID)
unexpected_labels = combined_df.loc[y.isna(), 'label'].unique()
if len(unexpected_labels) > 0:
    raise ValueError(f"Unexpected label values: {unexpected_labels.tolist()}")
y = y.astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Missing-value count for every split (all four should be 0)
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.isna().sum())

# Features and labels of the training split must have the same number of rows
print(X_train.shape[0])
print(y_train.shape[0])

0
0
0
0
33123
33123


In [None]:
# Tally how many reviews carry each label ('fake' vs 'real') in the full dataset
label_column = combined_df['label']
label_counts = label_column.value_counts()
label_counts


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
fake,20966
real,20438


In [None]:
# Report the class proportions of each split to confirm both are roughly balanced
split_reports = (
    ("Training set label distribution:", y_train),
    ("\nTesting set label distribution:", y_test),
)
for heading, split_labels in split_reports:
    print(heading)
    print(split_labels.value_counts(normalize=True))  # proportions of each class



Training set label distribution:
label
0    0.508378
1    0.491622
Name: proportion, dtype: float64

Testing set label distribution:
label
1    0.50163
0    0.49837
Name: proportion, dtype: float64


In [None]:
# TF-IDF vectorizer capped at 5000 features
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit on the training text only, then apply the fitted vectorizer to the test text
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Persist the fitted vectorizer to disk
vectorizer_path = 'tfidf_vectorizer.joblib'
joblib.dump(tfidf_vectorizer, vectorizer_path)
print(f"Vectorizer saved as '{vectorizer_path}'.")


# Candidate classifiers compared on the TF-IDF features.
# Logistic Regression
log_reg = LogisticRegression(random_state=42, max_iter=1000)
# Multinomial Naive Bayes
nb = MultinomialNB()
# Linear-kernel SVM with C=0.5; probability=True enables predict_proba
svm_model = SVC(
    C=0.5,
    kernel='linear',
    random_state=42,
    probability=True,
)


from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix


def _fit_and_evaluate(model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model`` and score it on the evaluation split.

    Parameters:
        model: classifier exposing ``fit``, ``predict`` and ``predict_proba``.
        X_fit, y_fit: features and labels the model is fitted on.
        X_eval, y_eval: features and labels the metrics are computed on.

    Returns:
        A tuple ``(preds, accuracy, auc, precision, recall, f1, conf_matrix)``.
        ``auc`` is computed from the predicted probability of class 1; every
        other metric is computed from the hard predictions.
    """
    model.fit(X_fit, y_fit)
    preds = model.predict(X_eval)
    positive_scores = model.predict_proba(X_eval)[:, 1]
    return (
        preds,
        accuracy_score(y_eval, preds),
        roc_auc_score(y_eval, positive_scores),
        precision_score(y_eval, preds),
        recall_score(y_eval, preds),
        f1_score(y_eval, preds),
        confusion_matrix(y_eval, preds),
    )


# Train and evaluate Logistic Regression model
(log_reg_preds, log_reg_accuracy, log_reg_auc, log_reg_precision,
 log_reg_recall, log_reg_f1, log_reg_conf_matrix) = _fit_and_evaluate(
    log_reg, X_train_tfidf, y_train, X_test_tfidf, y_test)

# Train and evaluate Naive Bayes model
(nb_preds, nb_accuracy, nb_auc, nb_precision,
 nb_recall, nb_f1, nb_conf_matrix) = _fit_and_evaluate(
    nb, X_train_tfidf, y_train, X_test_tfidf, y_test)

# Train and evaluate SVM model
(svm_preds, svm_accuracy, svm_auc, svm_precision,
 svm_recall, svm_f1, svm_conf_matrix) = _fit_and_evaluate(
    svm_model, X_train_tfidf, y_train, X_test_tfidf, y_test)


from sklearn.calibration import CalibratedClassifierCV

# Calibrate probabilities with cross-validation.
# The calibrator must be fitted on rows the underlying SVM was not trained on;
# calibrating an already-fitted model (cv="prefit") on X_train reuses exactly
# the rows it has memorised and yields over-confident probabilities. With cv=5
# each fold's SVM is trained on 4/5 of the training data and calibrated on the
# held-out 1/5.
# The base estimator copies svm_model's hyperparameters; probability=False is
# enough because calibration works from decision_function and this avoids
# SVC's own internal probability fitting.
svm_for_calibration = SVC(**{**svm_model.get_params(), "probability": False})
calibrated_svm = CalibratedClassifierCV(svm_for_calibration, cv=5)
calibrated_svm.fit(X_train_tfidf, y_train)

# Evaluate calibrated SVM model
calibrated_svm_preds = calibrated_svm.predict(X_test_tfidf)
calibrated_svm_accuracy = accuracy_score(y_test, calibrated_svm_preds)
calibrated_svm_auc = roc_auc_score(y_test, calibrated_svm.predict_proba(X_test_tfidf)[:, 1])
calibrated_svm_precision = precision_score(y_test, calibrated_svm_preds)
calibrated_svm_recall = recall_score(y_test, calibrated_svm_preds)
calibrated_svm_f1 = f1_score(y_test, calibrated_svm_preds)
calibrated_svm_conf_matrix = confusion_matrix(y_test, calibrated_svm_preds)


# Save the trained Logistic Regression model
joblib.dump(log_reg, 'log_reg.joblib')
print("Model saved as 'log_reg.joblib'.")

# Save the calibrated model
joblib.dump(calibrated_svm, 'calibrated_svm_model.joblib')

# Save the trained SVM model
joblib.dump(svm_model, 'svm_model.joblib')
print("Model saved as 'svm_model.joblib'.")

# Save SVM model parameters.
# Values are read back from the estimator so the JSON always matches the saved
# model; a hand-written copy had dropped the regularisation strength C.
svm_current_params = svm_model.get_params()
svm_params = {
    param_name: svm_current_params[param_name]
    for param_name in ("kernel", "C", "probability", "random_state")
}
with open('svm_params.json', 'w') as f:
    json.dump(svm_params, f)
print("Model parameters saved as 'svm_params.json'.")


# Gather one row of metrics per model, then pivot the rows into the
# column-oriented dict the DataFrame is built from.
metric_columns = [
    "Model", "Accuracy", "AUC-ROC", "Precision", "Recall", "F1 Score", "Confusion matrix",
]
per_model_rows = [
    ("Logistic Regression", log_reg_accuracy, log_reg_auc, log_reg_precision,
     log_reg_recall, log_reg_f1, log_reg_conf_matrix),
    ("Naive Bayes", nb_accuracy, nb_auc, nb_precision,
     nb_recall, nb_f1, nb_conf_matrix),
    ("SVM", svm_accuracy, svm_auc, svm_precision,
     svm_recall, svm_f1, svm_conf_matrix),
    ("Calibrate SVM", calibrated_svm_accuracy, calibrated_svm_auc, calibrated_svm_precision,
     calibrated_svm_recall, calibrated_svm_f1, calibrated_svm_conf_matrix),
]
results = {
    column: list(values)
    for column, values in zip(metric_columns, zip(*per_model_rows))
}

# Render the comparison table as the cell output
results_df = pd.DataFrame(results)
results_df

Vectorizer saved as 'tfidf_vectorizer.joblib'.
Model saved as 'log_reg.joblib'.
Model saved as 'svm_model.joblib'.
Model parameters saved as 'svm_params.json'.


Unnamed: 0,Model,Accuracy,AUC-ROC,Precision,Recall,F1 Score,Confusion matrix
0,Logistic Regression,0.858471,0.936608,0.848854,0.873375,0.86094,"[[3481, 646], [526, 3628]]"
1,Naive Bayes,0.841927,0.91825,0.855358,0.824266,0.839524,"[[3548, 579], [730, 3424]]"
2,SVM,0.864268,0.938226,0.858325,0.873616,0.865903,"[[3528, 599], [525, 3629]]"
3,Calibrate SVM,0.863181,0.938223,0.860072,0.86856,0.864295,"[[3540, 587], [546, 3608]]"


In [None]:
# Save the trained SVM model
joblib.dump(svm_model, 'svm_model.joblib')

# Save hyperparameters separately.
# Read them from the fitted estimator so the JSON cannot drift from the model;
# this includes the regularisation strength C, which a hand-written copy of
# the settings had left out.
svm_current_params = svm_model.get_params()
svm_params = {
    param_name: svm_current_params[param_name]
    for param_name in ("kernel", "C", "probability", "random_state")
}

with open('svm_params.json', 'w') as f:
    json.dump(svm_params, f)


In [None]:
import os

# Show the entries of the current working directory
working_dir_entries = os.listdir()
print(working_dir_entries)


In [None]:
!pip install transformers torch datasets


In [None]:
# NOTE(review): `datasets` is already named in the install command of the
# preceding cell, so this cell repeats that install.
!pip install datasets

In [None]:
from google.colab import files
uploaded = files.upload()  # This will prompt you to upload your file

from transformers import DistilBertTokenizer
from datasets import load_dataset

# Path that load_dataset reads the reviews from.
DATA_FILE = "Manually_Cleaned_Merged_Reviews_Dataset.csv"

# When a file with the same name already exists, Colab stores a new upload
# under a suffixed name (e.g. "... (1).csv"), which would leave DATA_FILE
# pointing at the older copy. Write the freshly uploaded bytes to DATA_FILE so
# the data loaded below is always the file that was just uploaded. If nothing
# was uploaded, whatever is already on disk is used.
if uploaded:
    upload_key = DATA_FILE if DATA_FILE in uploaded else next(iter(uploaded))
    with open(DATA_FILE, "wb") as data_fh:
        data_fh.write(uploaded[upload_key])

# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Load the local dataset file
raw_dataset = load_dataset("csv", data_files=DATA_FILE)
# Remove rows with missing values in the 'review_text' column
original_dataset = raw_dataset["train"].filter(lambda x: x["review_text"] is not None)

# 80/20 train/test split with a fixed seed
dataset = original_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset["train"]
test_dataset = dataset["test"]

def preprocess_data(examples):
    """Tokenize the ``review_text`` field of ``examples``.

    The text is passed to the module-level ``tokenizer`` with truncation
    enabled and ``padding="max_length"``; the tokenizer's result is returned
    unchanged.
    """
    review_texts = examples["review_text"]
    encoded = tokenizer(review_texts, truncation=True, padding="max_length")
    return encoded

# Run the tokenizer over both splits in batches (train first, then test)
train_dataset, test_dataset = (
    split.map(preprocess_data, batched=True)
    for split in (train_dataset, test_dataset)
)

# Return these columns as PyTorch tensors when the datasets are indexed
torch_columns = ["input_ids", "attention_mask", "label"]
for split in (train_dataset, test_dataset):
    split.set_format(type="torch", columns=list(torch_columns))


In [None]:
import os
from transformers import TrainingArguments, Trainer, DistilBertForSequenceClassification

# Turn off Weights & Biases logging via its environment switch
os.environ.update({"WANDB_DISABLED": "true"})

# Instantiate DistilBERT with a two-class classification head
checkpoint_name = "distilbert-base-uncased"
model = DistilBertForSequenceClassification.from_pretrained(checkpoint_name, num_labels=2)


In [None]:
import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and later stopped accepting the old name. The install in this notebook is
# unpinned, so pick whichever keyword the installed version accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    **{eval_strategy_key: 'epoch'},  # evaluate at the end of every epoch
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)



In [None]:
# Show which columns each split currently has (train first, then test)
for split in (train_dataset, test_dataset):
    print(split.column_names)


# Define the mapping function to convert labels to integers
def convert_labels_to_int(example):
    """Encode ``example["label"]`` as an integer: 1 for "real", 0 otherwise.

    String (or missing) labels are encoded by comparison with "real". A label
    that is already numeric is kept as-is (cast to ``int``), so applying the
    function a second time, e.g. when the cell is re-run, does not collapse
    every label to 0.

    Parameters:
        example: mapping with a "label" entry; it is modified in place.

    Returns:
        The same ``example`` object with its "label" replaced by an ``int``.
    """
    label = example["label"]
    if label is None or isinstance(label, str):
        example["label"] = 1 if label == "real" else 0
    else:
        # Already encoded (plain int or a 0-dim tensor/array): keep its value.
        example["label"] = int(label)
    return example

# Preview the first few labels of each split before conversion.
# (`datasets.Dataset` has no `.head` attribute, so take a slice instead.)
print(train_dataset[:5]["label"])
print(test_dataset[:5]["label"])

# Apply the conversion to both train and test datasets without caching
train_dataset = train_dataset.map(convert_labels_to_int, load_from_cache_file=False)
test_dataset = test_dataset.map(convert_labels_to_int, load_from_cache_file=False)

# Expose the model inputs AND the label as tensors; without the label column
# the model receives no targets and cannot compute a training loss.
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# map() returns new Dataset objects, while the Trainer built in an earlier cell
# still references the previous ones (with string labels). Point it at the
# converted datasets so training and evaluation use the integer labels.
trainer.train_dataset = train_dataset
trainer.eval_dataset = test_dataset

print(train_dataset)
print(test_dataset)

# Spot-check one converted label from each split
print(train_dataset[0]["label"])
print(test_dataset[0]["label"])


In [None]:
# Fine-tune the model using the configured Trainer
train_output = trainer.train()

# Score the model on the evaluation dataset; as the cell's last expression
# the returned metrics are displayed as the cell output
eval_metrics = trainer.evaluate()
eval_metrics