In [149]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import os
import pandas as pd

In [150]:
def extract_article_number(folder_name: str) -> int:
    return int(folder_name.split('_')[1])
def openText(file_path: str) -> str : 
    with open(file_path, "r", encoding="utf-8") as f: 
        return f.read()

In [151]:
baseDir = "train"
labels = pd.read_csv("train.csv")
realTextMap = dict(zip(labels.iloc[:, 0], labels.iloc[:, 1]))

In [152]:
finalTexts = []
finalLabels = []
folders = os.listdir(baseDir)
for folder in folders :
    file1Path = os.path.join(baseDir, folder, "file_1.txt")
    file2Path = os.path.join(baseDir, folder, "file_2.txt")
    docId = extract_article_number(folder)
    text1 = openText(file1Path)
    text2 = openText(file2Path)
    if(realTextMap[docId] == 1) :
        finalTexts.append(text1)
        finalLabels.append("real")
        finalTexts.append(text2)
        finalLabels.append("fake")
    else :
        finalTexts.append(text2)
        finalLabels.append("real")
        finalTexts.append(text1)
        finalLabels.append("fake")

In [153]:
vectorizer = TfidfVectorizer(stop_words='english')
tfIdf_matrix = vectorizer.fit_transform(finalTexts)
X_train, X_test, y_train, y_test = train_test_split(tfIdf_matrix, finalLabels, test_size = 0.4, random_state = 42)
model = LogisticRegression(max_iter = 1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [154]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
# Classification report
print(classification_report(y_test, y_pred))

# Confusion matrix
# cm = confusion_matrix(y_test, y_pred, labels=["real", "fake"])
# sns.heatmap(cm, annot=True, fmt='d', cmap="Blues", xticklabels=["real", "fake"], yticklabels=["real", "fake"])
# plt.xlabel("Predicted")
# plt.ylabel("Actual")
# plt.title("Confusion Matrix")
# plt.show()

              precision    recall  f1-score   support

        fake       0.82      0.36      0.50        39
        real       0.58      0.92      0.71        37

    accuracy                           0.63        76
   macro avg       0.70      0.64      0.60        76
weighted avg       0.70      0.63      0.60        76



In [167]:
# Example text
text = "The President held a press conference today to address rising inflation concerns."

# Transform using the same vectorizer
X_input = vectorizer.transform([text])  # must be a list

# Predict class
predicted_class = model.predict(X_input)[0]
predicted_prob = model.predict_proba(X_input)[0]

# Print results
print(f"Predicted Label: {predicted_class}")
print(f"Probability: {predicted_prob}")


Predicted Label: real
Probability: [0.47995371 0.52004629]


In [155]:
#Results
submission_rows = []
resultTestDir = "test"
resultFolders = sorted(os.listdir(resultTestDir))
for i, folder in enumerate(resultFolders) :
    file1Path = os.path.join(resultTestDir, folder, "file_1.txt")
    file2Path = os.path.join(resultTestDir, folder, "file_2.txt")
    text1 = openText(file1Path)
    text2 = openText(file2Path)
    tfidf_pair = vectorizer.transform([text1, text2])
    proba = model.predict_proba(tfidf_pair)
    prob_real_file1 = proba[0][1]
    prob_real_file2 = proba[1][1]
    real_text_id = 1 if prob_real_file1 > prob_real_file2 else 2
    submission_rows.append({
        "id": i,
        "real_text_id": real_text_id
    })

In [156]:
submission_df = pd.DataFrame(submission_rows)
submission_df.to_csv("submission.csv", index=False)