In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [8]:
def read_csv(filename: str, encoding: str = 'latin-1', sep: str = ',') -> pd.DataFrame:
    """Load a delimited text file into a DataFrame.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to
            'latin-1'; pass e.g. 'utf-8' for files that latin-1 would
            otherwise decode into mojibake without raising an error.
        sep: Field delimiter. Defaults to ','.

    Returns:
        The parsed file contents as a pandas DataFrame.
    """
    return pd.read_csv(filename, sep=sep, encoding=encoding)

# Relative paths of the labelled training CSV and the test CSV.
train_filename, test_filename = (
    'data/ds4420_kaggle_train_data.csv',
    'data/ds4420_kaggle_test_data.csv',
)

# Load each file through the shared reader so both are parsed the same way.
train_data = read_csv(filename=train_filename)
test_data = read_csv(filename=test_filename)

In [9]:
# Hold out 20% of the labelled rows for validation; the fixed random_state
# makes the shuffle (and therefore the split) repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    train_data['Text'],
    train_data['Label'],
    test_size=0.2,
    random_state=42,
)

In [10]:
# TF-IDF features with the vectorizer's default settings.
vectorizer = TfidfVectorizer()

# Fit on the training split only; the validation split is merely transformed
# with the already-fitted vectorizer, so it does not influence the features.
X_train_vec = vectorizer.fit_transform(raw_documents=X_train)
X_val_vec = vectorizer.transform(raw_documents=X_val)



In [20]:
# Train a logistic-regression classifier on the TF-IDF training features.
model = LogisticRegression(random_state=1)
model.fit(X_train_vec, y_train)

# Predict on both splits; only the validation predictions are reported below.
y_train_pred = model.predict(X_train_vec)
y_val_pred = model.predict(X_val_vec)

# Per-class precision / recall / F1 on the held-out validation split.
print(classification_report(y_true=y_val, y_pred=y_val_pred))

              precision    recall  f1-score   support

           0       0.84      0.95      0.89      3261
           1       0.85      0.60      0.70      1476

    accuracy                           0.84      4737
   macro avg       0.84      0.77      0.80      4737
weighted avg       0.84      0.84      0.83      4737



In [21]:
# Score the test set with the already-fitted vectorizer and model
# (transform/predict only -- nothing is re-fitted on the test data).
X_test = test_data['Text']
X_test_vec = vectorizer.transform(X_test)
y_pred = model.predict(X_test_vec)

# Build the submission as its own frame and leave `test_data` untouched.
# Overwriting `test_data` with the two-column frame would drop 'Text', so
# re-running this cell would fail with a KeyError on test_data['Text'].
submission = test_data[['ID']].assign(Label=y_pred)
submission.to_csv('submission.csv', index=False)