In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from lightgbm import LGBMClassifier
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score

%pylab inline

train_df = pd.read_csv('../input/train_set.csv', sep='\t', nrows=None)
test_df = pd.read_csv('../input/test_a.csv', sep='\t', nrows=None)

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


# 数据划分

In [6]:
# hold-out
from sklearn.model_selection import train_test_split

# K折交叉验证
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold

# K折分布保持交叉验证
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold

# 时间序列划分方法
from sklearn.model_selection import TimeSeriesSplit

# booststrap 采样
from sklearn.utils import resample

In [4]:
X = np.zeros((20, 5))
Y = np.array([1, 2, 3, 4] * 5)
print(X, Y)

# X = np.zeros((20, 5))
# Y = np.array([1]*5 + [2]*5 + [3]*5 + [4]*5)
# print(X, Y)

(array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]]), array([1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4]))


In [7]:
# 直接按照比例拆分
# train_X, val_X, train_y, val_y = train_test_split(X, Y, test_size = 0.2)
# print(train_y, val_y)

# 按照比例 & 标签分布划分
train_X, val_X, train_y, val_y = train_test_split(X, Y, test_size = 0.2, stratify=Y)
print(train_y, val_y)

(array([3, 2, 3, 2, 2, 4, 4, 1, 1, 3, 4, 2, 1, 3, 1, 4]), array([2, 4, 1, 3]))


In [8]:
kf = KFold(n_splits=5)
for train_idx, test_idx, in kf.split(X, Y):
    print(train_idx, test_idx)
    print('Label', Y[test_idx])
    print('')

(array([ 4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]), array([0, 1, 2, 3]))
('Label', array([1, 2, 3, 4]))

(array([ 0,  1,  2,  3,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]), array([4, 5, 6, 7]))
('Label', array([1, 2, 3, 4]))

(array([ 0,  1,  2,  3,  4,  5,  6,  7, 12, 13, 14, 15, 16, 17, 18, 19]), array([ 8,  9, 10, 11]))
('Label', array([1, 2, 3, 4]))

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 16, 17, 18, 19]), array([12, 13, 14, 15]))
('Label', array([1, 2, 3, 4]))

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]), array([16, 17, 18, 19]))
('Label', array([1, 2, 3, 4]))



In [9]:
kf = StratifiedKFold(n_splits=5)
for train_idx, test_idx, in kf.split(X, Y):
    print(train_idx, test_idx)
    print('Label', Y[test_idx])
    print('')

(array([ 4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]), array([0, 1, 2, 3]))
('Label', array([1, 2, 3, 4]))

(array([ 0,  1,  2,  3,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]), array([4, 5, 6, 7]))
('Label', array([1, 2, 3, 4]))

(array([ 0,  1,  2,  3,  4,  5,  6,  7, 12, 13, 14, 15, 16, 17, 18, 19]), array([ 8,  9, 10, 11]))
('Label', array([1, 2, 3, 4]))

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 16, 17, 18, 19]), array([12, 13, 14, 15]))
('Label', array([1, 2, 3, 4]))

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]), array([16, 17, 18, 19]))
('Label', array([1, 2, 3, 4]))



In [11]:
kf = TimeSeriesSplit(n_splits=5)
for train_idx, test_idx, in kf.split(X, Y):
    print(train_idx, test_idx)
    print('Label', Y[test_idx])
    print('')

(array([0, 1, 2, 3, 4]), array([5, 6, 7]))
('Label', array([2, 3, 4]))

(array([0, 1, 2, 3, 4, 5, 6, 7]), array([ 8,  9, 10]))
('Label', array([1, 2, 3]))

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10]), array([11, 12, 13]))
('Label', array([4, 1, 2]))

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13]), array([14, 15, 16]))
('Label', array([3, 4, 1]))

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16]), array([17, 18, 19]))
('Label', array([2, 3, 4]))



In [10]:
train_X, train_Y = resample(X, Y, n_samples=16)
val_X, val_Y = resample(X, Y, n_samples=4)
print(train_Y, val_Y)

(array([3, 3, 1, 4, 2, 4, 4, 3, 3, 2, 3, 3, 4, 1, 2, 3]), array([1, 3, 1, 1]))


# fasttext

In [12]:
train_df = pd.read_csv('../input/train_set.csv', sep='\t', nrows=None)
train_df['label_ft'] = '__label__' + train_df['label'].astype(str)
train_df[['text','label_ft']].iloc[:-5000].to_csv('train.csv', index=None, header=None, sep='\t')
train_df[['text','label_ft']].iloc[-5000:].to_csv('valid.csv', index=None, header=None, sep='\t')

In [19]:
import fasttext
model = fasttext.train_supervised(input='train.csv',
                                  autotuneValidationFile='valid.csv', 
                                  autotuneDuration=10)

In [20]:
model.test("valid.csv")

(5000L, 0.924, 0.924)

# 对抗验证

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
train_df = pd.read_csv('../input/train_set.csv', sep='\t', nrows=5000)
test_df = pd.read_csv('../input/test_a.csv', sep='\t', nrows=5000)

tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=500).fit(train_df['text'].iloc[:].values)
train_tfidf = tfidf.transform(train_df['text'].iloc[:].values)
test_tfidf = tfidf.transform(test_df['text'].iloc[:].values)

In [31]:
train_test = np.vstack([train_tfidf.toarray(), test_tfidf.toarray()])

In [56]:
lgb_data = lgb.Dataset(train_test, label=np.array([1]*5000+[0]*5000))

params = {}
params['max_bin'] = 10
params['learning_rate'] = 0.01
params['boosting_type'] = 'gbdt'
params['metric'] = 'auc'

result = lgb.cv(params, lgb_data, num_boost_round=100, nfold=3, verbose_eval=20)

[20]	cv_agg's auc: 0.489245 + 0.00545717
[40]	cv_agg's auc: 0.490305 + 0.00530909
[60]	cv_agg's auc: 0.487145 + 0.00571168
[80]	cv_agg's auc: 0.486393 + 0.00824175
[100]	cv_agg's auc: 0.486985 + 0.00789417


In [57]:
pd.DataFrame(result)

Unnamed: 0,auc-mean,auc-stdv
0,0.484417,0.008703
1,0.486034,0.006219
2,0.487326,0.006154
3,0.490341,0.006879
4,0.489918,0.006201
5,0.488269,0.008389
6,0.487300,0.008922
7,0.488676,0.007211
8,0.487124,0.008878
9,0.486269,0.007008
