In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
df_train = pd.read_csv('../new_data/train_set.csv')
df_test = pd.read_csv('../new_data/test_set.csv')
df_train.head(5)

Unnamed: 0,id,article,word_seg,class
0,0,7368 1252069 365865 755561 1044285 129532 1053...,816903 597526 520477 1179558 1033823 758724 63...,14
1,1,581131 165432 7368 957317 1197553 570900 33659...,90540 816903 441039 816903 569138 816903 10343...,3
2,2,7368 87936 40494 490286 856005 641588 145611 1...,816903 1012629 957974 1033823 328210 947200 65...,12
3,3,299237 760651 299237 887082 159592 556634 7489...,563568 1239563 680125 780219 782805 1033823 19...,13
4,4,7368 7368 7368 865510 7368 396966 995243 37685...,816903 816903 816903 139132 816903 312320 1103...,12


In [4]:
# 数据预处理
df_train.drop(columns=['article', 'id'], inplace=True)
df_test.drop(columns=['article'], inplace=True)
df_train.head(5)

Unnamed: 0,word_seg,class
0,816903 597526 520477 1179558 1033823 758724 63...,14
1,90540 816903 441039 816903 569138 816903 10343...,3
2,816903 1012629 957974 1033823 328210 947200 65...,12
3,563568 1239563 680125 780219 782805 1033823 19...,13
4,816903 816903 816903 139132 816903 312320 1103...,12


In [7]:
# 特征工程
# 将数据集中的字符文本转换成数字向量以便计算机进行处理（一段文字 ---> 一个向量）
vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9, max_features=100000)
vectorizer.fit(df_train['word_seg'])
x_train = vectorizer.transform(df_train['word_seg'])
x_test = vectorizer.transform(df_test['word_seg'])
y_train = df_train['class']-1

In [14]:
print(x_train)

  (0, 24)	1
  (0, 81)	1
  (0, 246)	1
  (0, 260)	1
  (0, 329)	1
  (0, 691)	2
  (0, 713)	1
  (0, 1269)	1
  (0, 1712)	1
  (0, 2007)	8
  (0, 2154)	1
  (0, 2631)	1
  (0, 2817)	2
  (0, 2967)	1
  (0, 3036)	1
  (0, 3751)	1
  (0, 3807)	3
  (0, 3869)	1
  (0, 4787)	1
  (0, 4868)	2
  (0, 5588)	1
  (0, 7478)	1
  (0, 7577)	1
  (0, 7635)	1
  (0, 7639)	1
  :	:
  (102276, 94070)	1
  (102276, 94071)	1
  (102276, 94381)	2
  (102276, 95013)	1
  (102276, 95092)	1
  (102276, 95175)	1
  (102276, 95207)	1
  (102276, 95614)	1
  (102276, 95793)	1
  (102276, 96350)	4
  (102276, 96351)	1
  (102276, 96628)	2
  (102276, 96723)	1
  (102276, 96725)	1
  (102276, 96739)	1
  (102276, 96757)	1
  (102276, 97224)	1
  (102276, 97225)	1
  (102276, 97900)	1
  (102276, 98241)	2
  (102276, 98344)	1
  (102276, 98367)	1
  (102276, 99563)	2
  (102276, 99850)	1
  (102276, 99851)	1


In [16]:
df_train.size

204554

In [18]:
x_train.size

49363849

In [19]:
y_train

0         13
1          2
2         11
3         12
4         11
5         12
6          0
7          9
8          9
9         18
10        17
11         6
12         8
13         3
14        16
15         8
16        12
17         9
18         9
19        13
20         9
21         8
22         0
23         1
24        12
25         0
26         6
27        16
28         9
29         7
          ..
102247     8
102248    17
102249    12
102250     8
102251     0
102252    13
102253    11
102254    10
102255    18
102256     1
102257     3
102258     2
102259     5
102260     8
102261     0
102262    17
102263     5
102264     7
102265    15
102266    17
102267    14
102268     2
102269     2
102270     2
102271     7
102272    13
102273     7
102274    11
102275     3
102276    10
Name: class, Length: 102277, dtype: int64

In [20]:
# 训练分类器（线性逻辑回归）
lg = LogisticRegression(C=4, dual=True)
lg.fit(x_train, y_train)



LogisticRegression(C=4, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [21]:
# 预测¶
# 根据上面训练好的分类器对测试集的每个样本进行预测
y_test = lg.predict(x_test)

In [22]:
# 保存
# 将测试集的预测结果保存至本地
df_test['class'] = y_test.tolist()
df_test['class'] = df_test['class'] + 1
df_result = df_test.loc[:, ['id', 'class']]
df_result.to_csv('./result.csv', index = False)