In [1]:
import pandas as pd
import numpy as np
train = pd.read_csv('data/si670_kaggle1_train.csv')
valid = pd.read_csv('data/si670_kaggle1_validation.csv')
test = pd.read_csv('data/test.csv')

In [2]:
# 字符数
train['char_length'] = train['text'].apply(len)
# 词数（英文可直接split，中文需先分词）
train['word_count'] = train['text'].str.split().apply(len)   # 英文适用
valid['char_length'] = valid['text'].apply(len)
valid['word_count'] = valid['text'].str.split().apply(len)
test['char_length'] = test['text'].apply(len)
test['word_count'] = test['text'].str.split().apply(len)

In [3]:
train['avg_word_len'] = train['text'].apply(lambda x: np.mean([len(w) for w in x.split()]) if len(x.split()) > 0 else 0)
valid['avg_word_len'] = valid['text'].apply(lambda x: np.mean([len(w) for w in x.split()]) if len(x.split()) > 0 else 0)
test['avg_word_len'] = test['text'].apply(lambda x: np.mean([len(w) for w in x.split()]) if len(x.split()) > 0 else 0)

In [5]:
import re

def avg_sent_len(text):
    # 按句子断开
    sentences = re.split(r'[.!?]', text)
    sentences = [s.strip() for s in sentences if s.strip()]  # 去掉空句
    if not sentences:
        return 0
    return np.mean([len(s.split()) for s in sentences])

train['avg_sent_len'] = train['text'].apply(avg_sent_len)
valid['avg_sent_len'] = valid['text'].apply(avg_sent_len)
test['avg_sent_len'] = test['text'].apply(avg_sent_len)

In [6]:
def punctuation_freq(text, punc):
    return text.count(punc) / len(text) if len(text)>0 else 0

train['comma_ratio'] = train['text'].apply(lambda x: punctuation_freq(x, ','))
train['period_ratio'] = train['text'].apply(lambda x: punctuation_freq(x, '.'))
train['question_ratio'] = train['text'].apply(lambda x: punctuation_freq(x, '?'))
train['exclamation_ratio'] = train['text'].apply(lambda x: punctuation_freq(x, '!'))
valid['comma_ratio'] = valid['text'].apply(lambda x: punctuation_freq(x, ','))
valid['period_ratio'] = valid['text'].apply(lambda x: punctuation_freq(x, '.'))
valid['question_ratio'] = valid['text'].apply(lambda x: punctuation_freq(x, '?'))
valid['exclamation_ratio'] = valid['text'].apply(lambda x: punctuation_freq(x, '!'))
test['comma_ratio'] = test['text'].apply(lambda x: punctuation_freq(x, ','))
test['period_ratio'] = test['text'].apply(lambda x: punctuation_freq(x, '.'))
test['question_ratio'] = test['text'].apply(lambda x: punctuation_freq(x, '?'))
test['exclamation_ratio'] = test['text'].apply(lambda x: punctuation_freq(x, '!'))

In [8]:
def bigram_unique_ratio(text):
    words = text.split()
    bigrams = [tuple(words[i:i+2]) for i in range(len(words)-1)]
    if not bigrams:
        return 0
    return len(set(bigrams)) / len(bigrams)

def trigram_unique_ratio(text):
    words = text.split()
    bigrams = [tuple(words[i:i+3]) for i in range(len(words)-1)]
    if not bigrams:
        return 0
    return len(set(bigrams)) / len(bigrams)

train['bigram_unique_ratio'] = train['text'].apply(bigram_unique_ratio)
train['trigram_unique_ratio'] = train['text'].apply(trigram_unique_ratio)
valid['bigram_unique_ratio'] = valid['text'].apply(bigram_unique_ratio)
valid['trigram_unique_ratio'] = valid['text'].apply(trigram_unique_ratio)
test['bigram_unique_ratio'] = test['text'].apply(bigram_unique_ratio)
test['trigram_unique_ratio'] = test['text'].apply(trigram_unique_ratio)


In [9]:
from collections import Counter

def shannon_entropy(text):
    words = text.split()
    if not words:
        return 0
    counts = np.array(list(Counter(words).values()))
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs))

train['shannon_entropy'] = train['text'].apply(shannon_entropy)
valid['shannon_entropy'] = valid['text'].apply(shannon_entropy)
test['shannon_entropy'] = test['text'].apply(shannon_entropy)

In [12]:
train.head()

Unnamed: 0,text,label,id,char_length,word_count,avg_word_len,avg_sent_len,comma_ratio,period_ratio,question_ratio,exclamation_ratio,bigram_unique_ratio,trigram_unique_ratio,shannon_entropy
0,White girls very rarely date Asian men. Even i...,1,0,1238,222,4.581081,12.166667,0.008078,0.011309,0.000808,0.002423,0.963801,0.995475,7.041116
1,I am a 23 year old male Indian American male. ...,1,1,1364,265,4.150943,11.909091,0.005132,0.013196,0.000733,0.002199,0.939394,0.988636,7.010302
2,"Take three people, Persons A, B, and C. They l...",1,2,1628,303,4.376238,10.033333,0.007985,0.014742,0.001229,0.002457,0.943709,0.986755,7.150306
3,(A) Work part-time in high school; Then go to ...,1,3,2269,405,4.604938,13.433333,0.008814,0.011459,0.000441,0.001322,0.945545,0.99505,7.455256
4,When police introduce a new form of speed prev...,1,4,1240,212,4.853774,21.2,0.008065,0.008065,0.0,0.0,0.952607,1.0,6.658566


In [13]:
features_for_model = ['char_length', 'word_count', 'avg_word_len', 'avg_sent_len',
                      'comma_ratio', 'period_ratio', 'question_ratio', 'exclamation_ratio',
                      'bigram_unique_ratio', 'trigram_unique_ratio', 'shannon_entropy']
X_train = train[features_for_model]
y_train = train['label']
X_valid = valid[features_for_model]
y_valid = valid['label']
X_test = test[features_for_model]

In [20]:
#standardize
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

In [30]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
model = LogisticRegression(penalty='l2',class_weight='balanced', max_iter=100, C=0.1, random_state=670, solver='liblinear')
model.fit(X_train, y_train)
y_valid_pred = model.predict_proba(X_valid)
y_pred = (y_valid_pred[:, 1] >= 0.5).astype(int)
print('F1分数:', f1_score(y_valid, y_pred))

F1分数: 0.7007560262412315


In [34]:
y_pred = (y_valid_pred[:, 1] >= 0.45).astype(int)
print('F1分数:', f1_score(y_valid, y_pred))

F1分数: 0.705292723149866
