# ChatGPT or Not

In [None]:
import string

import numpy as np
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import RocCurveDisplay, auc, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

### Set Seed

In [None]:
# Seed shared by the NumPy generator below and by the train/test split.
RANDOM_SEED = 42
# NumPy Generator seeded with RANDOM_SEED.
# NOTE(review): `rng` is not referenced by any cell visible here — confirm it
# is used elsewhere, otherwise it has no effect on reproducibility.
rng = np.random.default_rng(RANDOM_SEED)

### Data Importation and Cleaning

In [None]:
# Load the sentence-level dataset from S3 into a DataFrame, using CSV column 0
# as the index.
# NOTE(review): "{YOUR_BUCKET}" is a literal placeholder (this is a plain
# string, not an f-string) and must be replaced with a real bucket name.
df = pd.read_csv(
    "s3://{YOUR_BUCKET}/sentence_level_data.csv",
    index_col=[0],
    # Credentials handed to the S3 filesystem layer. The values below look like
    # placeholders — substitute real ones at run time and avoid committing them.
    storage_options={
        "key": "AWS_ACCESS_KEY",
        "secret": "AWS_SECRET_ACCESS_KEY",
    }
)

In [None]:
# Built once: deleting punctuation is the same mapping for every input, so the
# table does not need to be rebuilt on each call (clean_text runs once per row).
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(s: str) -> str:
    """Lower-case the text and strip ASCII punctuation.

    Every character listed in ``string.punctuation`` is deleted. All other
    characters, including whitespace and non-ASCII punctuation, are kept.

    :param s: (str) raw text
    :return: (str) lower-cased text with ASCII punctuation removed
    """
    return s.lower().translate(_PUNCTUATION_TABLE)

# Normalise every raw sentence. NOTE: the "cleaned_setence" spelling (sic) is
# kept because the lemmatization step reads the column under this exact name.
df["cleaned_setence"] = df["sentence"].apply(clean_text)

In [None]:
# Fetch the WordNet corpus through NLTK's downloader; presumably this is the
# data the WordNet-based lemmatizer below relies on.
nltk.download("wordnet")

# Single lemmatizer instance, passed to lemmatize_text for every row.
lemmer = WordNetLemmatizer()

In [None]:
def lemmatize_text(s: str, lemmer: WordNetLemmatizer) -> str:
    """Lemmatize each whitespace-separated word of the text.

    The text is split on runs of whitespace, every word is passed through
    ``lemmer.lemmatize``, and the results are re-joined with single spaces.

    :param s: (str) text to lemmatize
    :param lemmer: (WordNetLemmatizer) lemmatizer applied to each word
    :return: (str) the lemmatized words joined by single spaces
    """
    words = s.split()
    lemmas = map(lemmer.lemmatize, words)
    return " ".join(lemmas)

In [None]:
# Lemmatize every cleaned sentence; Series.apply forwards the shared lemmatizer
# to lemmatize_text as a keyword argument.
# NOTE: "cleaned_setence" (sic) matches the column name created earlier.
df["lemmatized_text"] = df["cleaned_setence"].apply(lemmatize_text, lemmer=lemmer)

In [None]:
# TF-IDF vectorizer left at scikit-learn's defaults (no arguments passed); it
# is fitted on the training split further down.
tfidf = TfidfVectorizer()

In [None]:
# Hold out 20% of the rows for evaluation. Features are the lemmatized
# sentences and the target is the "class" column; the fixed random_state makes
# the split repeatable.
# NOTE(review): no `stratify` argument is passed — confirm the class balance
# makes an unstratified split acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    df["lemmatized_text"],
    df["class"],
    test_size=0.2,
    random_state=RANDOM_SEED
)

In [None]:
# Fit the vectorizer on the training split only, then reuse the fitted
# vectorizer to transform the test split so no test data informs the fit.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

### Logistic Regression

In [None]:
# Logistic regression classifier left at scikit-learn's defaults (no arguments
# passed).
model = LogisticRegression()

In [None]:
# Train the classifier on the TF-IDF features and labels of the training split.
model.fit(X_train_tfidf, y_train)

In [None]:
# Predict labels for the held-out split, then report each metric in turn.
y_pred = model.predict(X_test_tfidf)

for label, score_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-Score:", f1_score),
):
    print(label, score_fn(y_test, y_pred))