## 1. Read dataset, train, validate, test

In [1]:
import pandas as pd

# Load the three CSV splits (train, validation, test) in one pass; the read order matches
# the order of the target names on the left-hand side.
train, valid, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/si670_kaggle1_train.csv',
        'data/si670_kaggle1_validation.csv',
        'data/test.csv',
    )
)

In [16]:
# Display the (rows, columns) dimensions of the test frame.
test.shape

(60743, 2)

In [71]:
# Display the (rows, columns) dimensions of the training frame.
train.shape

(319071, 3)

In [73]:
# Mean of the `label` column in the training frame.
# Assuming labels are 0/1, this is the fraction of positive examples — TODO confirm encoding.
ratio_train = train['label'].mean()
ratio_train

0.2924678206418007

In [74]:
# Mean of the `label` column in the validation frame.
# Assuming labels are 0/1, this is the fraction of positive examples — TODO confirm encoding.
# NOTE(review): compare this value with ratio_train; a large gap means the two splits have
# different class balance, which affects any decision threshold tuned on validation.
ratio_valid = valid['label'].mean()
ratio_valid

0.5070960698689956

Check for nulls

In [3]:
# Count missing values in each column of the training frame.
train.isnull().sum()

text     0
label    0
id       0
dtype: int64

It seems there are no null values in any column of the training set.

## 2. Data processing / text processing

In [69]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF with English stop words removed. Terms appearing in fewer than
# 5 documents or in more than 80% of documents are dropped from the vocabulary.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.8,
)

# Targets for the two labelled splits.
y_train, y_valid = train['label'], valid['label']

# The vocabulary is learned from the training text only; validation and test text are
# projected onto that same vocabulary.
tfidf_train = vectorizer.fit_transform(train['text'])
tfidf_valid, tfidf_test = (
    vectorizer.transform(frame['text']) for frame in (valid, test)
)

tfidf_train.shape

(319071, 941086)

How should we deal with the high dimensionality (941,086 TF-IDF features)?

In [5]:
# Disabled duplicate of the TF-IDF fit from the previous code cell (same vectorizer settings).
# vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=5, max_df=0.8, stop_words='english')
# tfidf_train = vectorizer.fit_transform(train['text'])
# tfidf_train.shape

In [15]:
from sklearn.decomposition import TruncatedSVD

# Reduce the sparse TF-IDF matrix to 1000 components. The decomposition is fitted on the
# training matrix only and then applied unchanged to the validation matrix.
# NOTE(review): the later cells visible in this notebook fit and score the model on the
# un-reduced tfidf_* matrices, and tfidf_test is not transformed here — confirm whether
# the *_svd outputs are still needed.
svd = TruncatedSVD(n_components=1000, random_state=670)
tfidf_train_svd = svd.fit_transform(tfidf_train)
tfidf_valid_svd = svd.transform(tfidf_valid)

In [54]:
# Display the (rows, components) dimensions of the SVD-reduced training matrix.
tfidf_train_svd.shape

(319071, 1000)

## 3. Train on Logistic Regression

In [102]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# L1-regularised logistic regression fitted with the saga solver; class weights are
# balanced and the seed is fixed for repeatability.
model = LogisticRegression(
    solver='saga',
    penalty='l1',
    C=10,
    class_weight='balanced',
    max_iter=100,
    random_state=670,
)
model.fit(tfidf_train, y_train)

# Score the validation split. Column 1 of the predict_proba output is the probability
# assigned to label 1; predict 1 when that probability is at least 0.5.
y_valid_pred_probas = model.predict_proba(tfidf_valid)
y_pred = (y_valid_pred_probas[:, 1] >= 0.5).astype(int)
print('F1分数:', f1_score(y_valid, y_pred))

F1分数: 0.7327827253479012




In [116]:
# Re-threshold the validation probabilities already computed for label 1, this time
# predicting 1 when the probability is at least 0.2, and report the resulting F1.
y_pred = (y_valid_pred_probas[:, 1] >= 0.2).astype(int)
print('F1分数:', f1_score(y_valid, y_pred))

F1分数: 0.7474085256272015


In [None]:
# Disabled experiment: 5-fold grid search over C for an elastic-net logistic regression,
# followed by scoring the best estimator on the validation split.
# from sklearn.model_selection import GridSearchCV
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import f1_score
# 
# param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
# lr = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.8, max_iter=1000)
# 
# y_valid = valid['label']
# y_train = train['label']
# 
# # GridSearchCV
# grid = GridSearchCV(lr, param_grid, cv=5, scoring='f1')
# grid.fit(tfidf_train, y_train)
# 
# print("best para：", grid.best_params_)
# 
# # validate
# best_model = grid.best_estimator_
# y_val_pred = best_model.predict(tfidf_valid)
# 
# # Score against the true labels y_valid (the earlier draft passed the feature matrix
# # tfidf_valid as the first argument, which is not a label vector).
# print("验证集F1分数：", f1_score(y_valid, y_val_pred))

In [117]:
# Score the test split with the fitted model. Column 1 of the predict_proba output is the
# probability assigned to label 1; predict 1 when that probability is at least 0.2.
test_pred_probas = model.predict_proba(tfidf_test)
test_pred = (test_pred_probas[:, 1] >= 0.2).astype(int)

# 4. Build the result frame (id, label) and export it to CSV without the index column.
out_df = test[['id']].assign(label=test_pred)
out_df.to_csv('test_pred.csv', index=False)