## 1st iteration

In [None]:
import pandas as pd
import json

# Read the JSON Lines file: one JSON object per line.
# Blank / whitespace-only lines are skipped so that a stray empty line in the
# file does not make json.loads raise JSONDecodeError.
with open('detection_dataset_1st_iter.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

# One row per JSON record; the columns come from the record keys.
df = pd.DataFrame(data)
df.head()

### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF helps to determine the importance of words in the text
from sklearn.model_selection import train_test_split

# Fit the vocabulary and IDF weights on the training texts only. Fitting on the
# whole corpus lets the held-out texts influence the features (data leakage),
# which makes the test metrics optimistic.
# NOTE: test_size / random_state / stratify must stay identical to the
# train_test_split call that is later applied to X and y. With the same
# arguments and the same number of rows, the split picks the same rows, so the
# texts held out there are exactly the ones excluded from fitting here.
train_texts, _ = train_test_split(
    df['text'], test_size=0.2, random_state=42, stratify=df['label']
)

vectorizer = TfidfVectorizer(max_features=5000)  # try different values or without it to see how it affects performance
vectorizer.fit(train_texts)

# Project every row onto the train-only vocabulary; the split into
# train/test matrices happens in a later cell.
X = vectorizer.transform(df['text'])
y = df['label']  # 0 is real, 1 is generated

In [None]:
# Inspect the terms the fitted TF-IDF vectorizer kept as features:
# print how many there are, then the first ten of them.
vocab = vectorizer.get_feature_names_out()
first_terms = vocab[:10]
print(f"Vocabulary size: {len(vocab)}", first_terms, sep="\n")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The fixed seed makes the split
# repeatable, and stratifying on y keeps the label balance in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Random forest baseline (100 trees, fixed seed), trained on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out split and print per-class precision / recall / F1.
y_pred = rf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

### Sentence Transformer

In [None]:
# for text understanding: encode each text with a pretrained sentence-transformer model
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")  # or paraphrase-MiniLM-L3-v2

# encode() is given a plain Python list of strings, one per dataframe row.
texts = df["text"].tolist()
X = model.encode(texts, show_progress_bar=True)
y = df["label"]

In [None]:
from sklearn.model_selection import train_test_split

# Same 80/20 stratified split with the same seed, now applied to the
# sentence embeddings held in X.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Same random forest configuration as before, refit on the current
# X_train / y_train (the sentence-embedding split in run order).
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)

# Evaluate on the held-out rows and print the per-class metrics.
y_pred = rf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

## 2nd iteration

## 3rd iteration