In [2]:
import pandas as pd
train = pd.read_csv("train_drcat_04.csv")

In [3]:
train.rename(columns = {'essay_id': 'id',
                        'label' : 'generated',
                        'prompt': 'prompt_id'}, inplace=True)
train['prompt_id'] = pd.factorize(train['prompt_id'])[0]

In [4]:
train = train[['id', 'prompt_id', 'text', 'generated']]
train

Unnamed: 0,id,prompt_id,text,generated
0,E897534557AF,0,"In recent years, technology has had a profoun...",1
1,DFBA34FFE11D,-1,Should students participate in an extracurricu...,0
2,af37ecf5,-1,The electoral college is a symbol of mockery a...,0
3,5EC2696BAD78,-1,This is why I think the principle should allow...,0
4,llama_70b_v1843,1,I strongly believe that meditation and mindful...,1
...,...,...,...,...
44201,F7341069C4A4,-1,"""Oh man I didn't make the soccer team!"", yelle...",0
44202,AFE6E553DAC2,-1,I believe that using this technology could be ...,0
44203,falcon_180b_v1_600,92,The Face on Mars is a fascinating phenomenon t...,1
44204,A5F84C104693,-1,Texting & Driving\n\nUsing your phone while dr...,0


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=10000)
X_train = vectorizer.fit_transform(train["text"])

y_train = train["generated"]

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Define a list of possible values for C
C_list = [0.01, 0.1, 1, 10, 100]

# Initialize the best accuracy and the best C
best_acc = 0
best_C = 0

# Loop over the C values
for C in C_list:
    # Create a logistic regression model with the current C
    model = LogisticRegression(C=C)
    # Fit the model on the train set
    model.fit(X_train, y_train)
    # Predict the labels on the validation set
    y_pred = model.predict(X_val)
    # Compute the accuracy on the validation set
    acc = accuracy_score(y_val, y_pred)
    # Print the accuracy and the C value
    print(f"Accuracy: {acc}, C: {C}")
    # Update the best accuracy and the best C if needed
    if acc > best_acc:
        best_acc = acc
        best_C = C

# Print the best accuracy and the best C
print(f"Best accuracy: {best_acc}, Best C: {best_C}")

Accuracy: 0.883158013799344, C: 0.01
Accuracy: 0.9704784526637258, C: 0.1
Accuracy: 0.9868793122949893, C: 1
Accuracy: 0.9902725935980092, C: 10
Accuracy: 0.9902725935980092, C: 100
Best accuracy: 0.9902725935980092, Best C: 10


In [8]:
# Create a logistic regression model with the best C
final_model = LogisticRegression(C=best_C)
# Fit the model on the whole train set
final_model.fit(X_train, y_train)

In [9]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict the labels on the test set
y_pred = final_model.predict(X_test)
# Compute the accuracy on the test set
acc = accuracy_score(y_test, y_pred)
# Print the accuracy
print(f"Accuracy: {acc}")

# Compute the confusion matrix on the test set
cm = confusion_matrix(y_test, y_pred)
# Print the confusion matrix
print(cm)

# Print the classification report
print(classification_report(y_test, y_pred))

Accuracy: 0.990047500565483
[[5896   40]
 [  48 2858]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5936
           1       0.99      0.98      0.98      2906

    accuracy                           0.99      8842
   macro avg       0.99      0.99      0.99      8842
weighted avg       0.99      0.99      0.99      8842



In [10]:
import pickle
pickle.dump(final_model, open('model.pkl','wb'))

In [11]:
pickle.load(open('model.pkl','rb'))

In [12]:
pickle.dump(vectorizer, open('tfidf.pkl','wb'))