## 1. Read dataset, train, validate, test

In [2]:
import pandas as pd
train = pd.read_csv('data/si670_kaggle1_train.csv')
valid = pd.read_csv('data/si670_kaggle1_validation.csv')
test = pd.read_csv('data/test.csv')

In [3]:
test.shape

(60743, 2)

In [4]:
train.shape

(319071, 3)

In [5]:
ratio_train = train['label'].mean()
ratio_train

0.2924678206418007

In [6]:
ratio_valid = valid['label'].mean()
ratio_valid

0.5070960698689956

Check for nulls

In [7]:
train.isnull().sum()

text     0
label    0
id       0
dtype: int64

seems like there are no nulls

## 2. Data processing/ text processing

In [82]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df= 0.00005)
tfidf_train = vectorizer.fit_transform(train['text'])
y_train = train['label']
tfidf_valid = vectorizer.transform(valid['text'])
y_valid = valid['label']
tfidf_test = vectorizer.transform(test['text'])

tfidf_train.shape

(319071, 439151)

try separate 1-gram and 2-gram

In [57]:
vectorizer_uni = TfidfVectorizer(
    ngram_range=(1,1),
    min_df=0.00005,
    max_df=0.8,
    stop_words='english'
)

tfidf_train_uni = vectorizer_uni.fit_transform(train['text'])
tfidf_valid_uni = vectorizer_uni.transform(valid['text'])
tfidf_test_uni = vectorizer_uni.transform(test['text'])

In [36]:
vectorizer_bi = TfidfVectorizer(
    ngram_range=(2,2),
    min_df=0.00002,
    max_df=0.8,
    stop_words='english'
)

tfidf_train_bi = vectorizer_bi.fit_transform(train['text'])
tfidf_valid_bi = vectorizer_bi.transform(valid['text'])
tfidf_test_bi = vectorizer_bi.transform(test['text'])

In [20]:
# vectorizer_tri = TfidfVectorizer(
#     ngram_range=(3,3),
#     min_df=0.00005,
#     max_df=0.8,
#     stop_words='english'
# )
# 
# tfidf_train_tri = vectorizer_bi.fit_transform(train['text'])
# tfidf_valid_tri = vectorizer_bi.transform(valid['text'])
# tfidf_test_tri = vectorizer_bi.transform(test['text'])

In [58]:
from scipy.sparse import hstack
tfidf_train_all = hstack([tfidf_train_uni, tfidf_train_bi])
tfidf_valid_all = hstack([tfidf_valid_uni, tfidf_valid_bi])
tfidf_test_all = hstack([tfidf_test_uni, tfidf_test_bi])
tfidf_train_all.shape

(319071, 617596)

train on all

In [85]:
df_all = pd.concat([train, valid], axis=0, ignore_index=True)
X_all_text = df_all['text']
y_all = df_all['label']
# tfidf_all_1 = vectorizer_bi.fit_transform(X_all_text)
# tfidf_all_2 = vectorizer_uni.fit_transform(X_all_text)
# tfidf_test_1 = vectorizer_bi.transform(test['text'])
# tfidf_test_2 = vectorizer_uni.transform(test['text'])
# tfidf_all = hstack([tfidf_all_1, tfidf_all_2])
# tfidf_test_alll = hstack([tfidf_test_1, tfidf_test_2])
vectorizer_all = TfidfVectorizer(ngram_range=(1,2), min_df= 0.00005)
tfidf_all = vectorizer_all.fit_transform(X_all_text)

In [86]:
tfidf_valid = vectorizer_all.transform(valid['text'])
y_valid = valid['label']
tfidf_test = vectorizer_all.transform(test['text'])

In [78]:
tfidf_all.shape

(375863, 632690)

How to deal with high dimensionality?

In [15]:
# vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=5, max_df=0.8, stop_words='english')
# tfidf_train = vectorizer.fit_transform(train['text'])
# tfidf_train.shape

In [16]:
# from sklearn.decomposition import TruncatedSVD
# 
# svd = TruncatedSVD(n_components=1000, random_state=670)
# tfidf_train_svd = svd.fit_transform(tfidf_train)
# tfidf_valid_svd = svd.transform(tfidf_valid)

In [17]:
# tfidf_train_svd.shape

## 3. Train on Logistic Regression

In [87]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from imblearn.over_sampling import RandomOverSampler

# ros = RandomOverSampler(random_state=670)
# X_resampled_train, y_resampled = ros.fit_resample(tfidf_train_all, y_train)


model = LogisticRegression(penalty='l2',class_weight='balanced', max_iter=100, C=20, random_state=670, solver='liblinear')

#model.fit(tfidf_train, y_train)
model.fit(tfidf_all, y_all)
y_valid_pred_probas = model.predict_proba(tfidf_valid)

#model.fit(tfidf_train_all, y_train)

# model.fit(X_resampled_train, y_resampled)
#y_valid_pred_probas = model.predict_proba(tfidf_valid_all)

# probas[:, 1] 是预测为1的概率
y_pred = (y_valid_pred_probas[:, 1] >= 0.5).astype(int)
print('F1分数:', f1_score(y_valid, y_pred))

F1分数: 0.9943469412603712


In [88]:
y_pred = (y_valid_pred_probas[:, 1] >= 0.34).astype(int)
print('F1分数:', f1_score(y_valid, y_pred))

F1分数: 0.9888128522917277


In [20]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import f1_score
# 
# param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
# lr = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.8, max_iter=1000)
# 
# y_valid = valid['label']
# y_train = train['label']
# 
# # GridSearchCV
# grid = GridSearchCV(lr, param_grid, cv=5, scoring='f1')
# grid.fit(tfidf_train, y_train)
# 
# print("best para：", grid.best_params_)
# 
# # validate
# best_model = grid.best_estimator_
# y_val_pred = best_model.predict(tfidf_valid)
# 
# print("验证集F1分数：", f1_score(tfidf_valid, y_val_pred))

In [89]:
test_pred_probas = model.predict_proba(tfidf_test)
# probas[:, 1] 是预测为1的概率
test_pred = (test_pred_probas[:, 1] >= 0.34).astype(int)

# 4. 生成并导出结果
out_df = pd.DataFrame({'id': test['id'], 'label': test_pred})
out_df.to_csv('test_pred.csv', index=False)