In [1]:
### word cut,vectorize, and plot tools
import jieba
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
%matplotlib notebook

## Modelling 
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
import warnings
warnings.filterwarnings('ignore')

### Data Preparation

In [3]:
fname = pd.read_csv('C:/Users/ywan3/Downloads/sqlResult_1558435.csv',encoding='gb18030')

In [4]:
fname.head(5)

Unnamed: 0,id,author,source,content,feature,title,url
0,89617,,快科技@http://www.kkj.cn/,此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""37""...",小米MIUI 9首批机型曝光：共计15款,http://www.cnbeta.com/articles/tech/623597.htm
1,89616,,快科技@http://www.kkj.cn/,骁龙835作为唯一通过Windows 10桌面平台认证的ARM处理器，高通强调，不会因为只考...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""15""...",骁龙835在Windows 10上的性能表现有望改善,http://www.cnbeta.com/articles/tech/623599.htm
2,89615,,快科技@http://www.kkj.cn/,此前的一加3T搭载的是3400mAh电池，DashCharge快充规格为5V/4A。\r\n...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""18""...",一加手机5细节曝光：3300mAh、充半小时用1天,http://www.cnbeta.com/articles/tech/623601.htm
3,89614,,新华社,这是6月18日在葡萄牙中部大佩德罗冈地区拍摄的被森林大火烧毁的汽车。新华社记者张立云摄\r\n,"{""type"":""国际新闻"",""site"":""环球"",""commentNum"":""0"",""j...",葡森林火灾造成至少62人死亡 政府宣布进入紧急状态（组图）,http://world.huanqiu.com/hot/2017-06/10866126....
4,89613,胡淑丽_MN7479,深圳大件事,（原标题：44岁女子跑深圳约会网友被拒，暴雨中裸身奔走……）\r\n@深圳交警微博称：昨日清...,"{""type"":""新闻"",""site"":""网易热门"",""commentNum"":""978"",...",44岁女子约网友被拒暴雨中裸奔 交警为其披衣相随,http://news.163.com/17/0618/00/CN617P3Q0001875...


In [5]:
len(fname)

89611

In [6]:
news_dropna = fname.dropna(subset=['source', 'content'])
len(news_dropna)

87052

In [7]:
def transform(line):
    class_ = 1 if line['source'] == '新华社' else 0
    return pd.Series([class_, line['content']], index=['y', 'content'])

In [8]:
data = news_dropna.apply(transform, axis=1)

In [9]:
data.head()

Unnamed: 0,y,content
0,0,此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/...
1,0,骁龙835作为唯一通过Windows 10桌面平台认证的ARM处理器，高通强调，不会因为只考...
2,0,此前的一加3T搭载的是3400mAh电池，DashCharge快充规格为5V/4A。\r\n...
3,1,这是6月18日在葡萄牙中部大佩德罗冈地区拍摄的被森林大火烧毁的汽车。新华社记者张立云摄\r\n
4,0,（原标题：44岁女子跑深圳约会网友被拒，暴雨中裸身奔走……）\r\n@深圳交警微博称：昨日清...


In [10]:
corpus = data.content.to_list()
y = data.y.values.astype(np.int)

In [11]:
y.shape, len(corpus)

((87052,), 87052)

### 使用 tfidf 进行文本向量化

In [12]:
corpus_cut = []
mask = []
for sentence in tqdm(corpus):
    if not isinstance(sentence, str):
        mask.append(False)
        continue
    mask.append(True)
    sentence = ''.join(re.findall(r'\w+', sentence))
    corpus_cut.append(' '.join(jieba.cut(sentence)))
len(corpus_cut)

  0%|                                                                                        | 0/87052 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ywan3\AppData\Local\Temp\jieba.cache
Loading model cost 0.790 seconds.
Prefix dict has been built succesfully.
100%|███████████████████████████████████████████████████████████████████████████| 87052/87052 [02:18<00:00, 628.10it/s]


87052

In [13]:
y = y[mask]
len(y)

87052

In [14]:
corpus_cut[1]

'骁龙 835 作为 唯一 通过 Windows10 桌面 平台 认证 的 ARM 处理器 高通 强调 不会 因为 只 考虑 性能 而 去 屏蔽掉 小 核心 相反 他们 正 联手 微软 找到 一种 适合 桌面 平台 的 兼顾 性能 和 功耗 的 完美 方案 报道 称 微软 已经 拿到 了 一些 新 的 源码 以便 Windows10 更好 地 理解 biglittle 架构 资料 显示 骁龙 835 作为 一款 集成 了 CPUGPU 基带 蓝牙 WiFi 的 SoC 比 传统 的 Wintel 方案 可以 节省 至少 30 的 PCB 空间 按计划 今年 Q4 华硕 惠普 联想 将 首发 骁龙 835Win10 电脑 预计 均 是 二合一 形态 的 产品 当然 高通 骁龙 只是 个 开始 未来 也许 还 能 见到 三星 Exynos 联发科 华为 麒麟 小米 澎湃 等 进入 Windows10 桌面 平台'

In [15]:
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=300)
X = vectorizer.fit_transform(corpus_cut)

In [16]:
X = X.toarray()
X.shape

(87052, 300)

### Building Model

In [17]:
### Set trainning and test set
X_train, x_test, Y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=2019)

#### SVM

In [18]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=2019, test_size=0.15)

In [19]:
svc = SVC(verbose=5)
svc.fit(x_train, y_train)

[LibSVM]

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=5)

In [20]:
y_pred = svc.predict(x_valid)
y_pred_proba = svc.decision_function(x_valid)

In [21]:
svc.score(x_valid, y_valid)

0.9009009009009009

In [22]:
precision_score(y_valid, y_pred)

0.9009009009009009

In [23]:
recall_score(y_valid, y_pred)

1.0

In [24]:
f1_score(y_valid, y_pred)

0.947867298578199

In [25]:
roc_auc_score(y_valid, y_pred_proba)

0.9842204545454545

###### Change weight to “balance”

In [26]:
svc = SVC(class_weight='balanced', verbose=5) 
svc.fit(x_train, y_train)

[LibSVM]

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=5)

In [27]:
y_pred = svc.predict(x_valid) 
y_pred_proba = svc.decision_function(x_valid)

In [28]:
svc.score(x_valid, y_valid)

0.8302702702702702

In [29]:
precision_score(y_valid, y_pred)

0.9997536945812808

In [30]:
recall_score(y_valid, y_pred)

0.8118

In [31]:
f1_score(y_valid, y_pred)

0.8960264900662251

In [32]:
roc_auc_score(y_valid, y_pred_proba)

0.985353090909091

#### Change kernel to linear

In [33]:
svc = SVC(kernel='linear', class_weight='balanced', verbose=5)    
svc.fit(x_train, y_train)

[LibSVM]

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=5)

In [34]:
y_pred = svc.predict(x_valid)   
y_pred_proba = svc.decision_function(x_valid)

In [35]:
svc.score(x_valid, y_valid)

0.9674774774774775

In [36]:
precision_score(y_valid, y_pred)

0.9983455692275877

In [37]:
recall_score(y_valid, y_pred)

0.9655

In [38]:
f1_score(y_valid, y_pred)

0.9816481114330741

In [39]:
roc_auc_score(y_valid, y_pred_proba)

0.9942082727272727

#### work very slow but performs better

In [40]:
svc = SVC(class_weight='balanced', gamma=0.8, verbose=5)   
svc.fit(x_train, y_train)

[LibSVM]

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.8, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=5)

In [41]:
y_pred = svc.predict(x_valid)   
y_pred_proba = svc.decision_function(x_valid)

In [42]:
svc.score(x_valid, y_valid)

0.9790990990990991

In [43]:
precision_score(y_valid, y_pred)

0.9967453213995118

In [44]:
recall_score(y_valid, y_pred)

0.98

In [45]:
f1_score(y_valid, y_pred)

0.9883017345703914

In [46]:
roc_auc_score(y_valid, y_pred_proba)

0.9954502727272727

#### Performs better

### Rondom Forest

In [47]:
rfc = RandomForestClassifier(oob_score=True, class_weight='balanced', verbose=5, random_state=42, n_jobs=4)
rfc.fit(x_train, y_train)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.


building tree 1 of 10building tree 2 of 10building tree 3 of 10building tree 4 of 10



building tree 5 of 10
building tree 6 of 10building tree 7 of 10building tree 8 of 10


building tree 9 of 10


[Parallel(n_jobs=4)]: Done   6 out of  10 | elapsed:    0.9s remaining:    0.6s


building tree 10 of 10


[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    1.2s finished


RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=10, n_jobs=4, oob_score=True,
                       random_state=42, verbose=5, warm_start=False)

In [48]:
y_pred = rfc.predict(x_valid)   
y_pred_proba = rfc.predict_proba(x_valid)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished


In [49]:
rfc.score(x_valid, y_valid)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished


0.9905405405405405

In [50]:
precision_score(y_valid, y_pred)

0.9938117576604452

In [51]:
recall_score(y_valid, y_pred)

0.9957

In [52]:
f1_score(y_valid, y_pred)

0.994754982766372

In [53]:
roc_auc_score(y_valid, y_pred_proba[:, 1])

0.9942693181818183

In [54]:
#### increase the number of estimator
rfc = RandomForestClassifier(n_estimators=15, oob_score=True, class_weight='balanced', verbose=5, random_state=42, n_jobs=4)
rfc.fit(x_train, y_train)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.


building tree 1 of 15building tree 2 of 15building tree 3 of 15building tree 4 of 15



building tree 5 of 15
building tree 6 of 15
building tree 7 of 15
building tree 8 of 15
building tree 9 of 15
building tree 10 of 15
building tree 11 of 15
building tree 12 of 15
building tree 13 of 15
building tree 14 of 15
building tree 15 of 15


[Parallel(n_jobs=4)]: Done  12 out of  15 | elapsed:    1.4s remaining:    0.3s
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:    1.5s finished


RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=15, n_jobs=4, oob_score=True,
                       random_state=42, verbose=5, warm_start=False)

In [55]:
y_pred = rfc.predict(x_valid)   
y_pred_proba = rfc.predict_proba(x_valid)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  12 out of  15 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  12 out of  15 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:    0.0s finished


In [56]:
rfc.score(x_valid, y_valid)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  12 out of  15 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:    0.0s finished


0.9907207207207207

In [57]:
precision_score(y_valid, y_pred)

0.9932223661915678

In [58]:
recall_score(y_valid, y_pred)

0.9965

In [59]:
f1_score(y_valid, y_pred)

0.9948584835022214

In [60]:
roc_auc_score(y_valid, y_pred_proba[:, 1])

0.9949239545454546

### Naive Bayes

In [61]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=2019, test_size=0.15)

In [62]:
gnb = GaussianNB()
gnb.fit(x_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [63]:
y_pred = gnb.predict(x_valid)
y_pred_proba = gnb.predict_proba(x_valid)

In [64]:
gnb.score(x_valid, y_valid)

0.8063963963963964

In [65]:
precision_score(y_valid, y_pred)

0.9984761904761905

In [66]:
recall_score(y_valid, y_pred)

0.7863

In [67]:
f1_score(y_valid, y_pred)

0.8797762237762238

In [68]:
roc_auc_score(y_valid, y_pred_proba[:, 1])

0.9410102272727272

### Clustering

In [69]:
from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabaz_score

In [70]:
k_means = KMeans()
y_pred = k_means.fit_predict(X_train)

In [71]:
calinski_harabaz_score(X_train, y_pred)

2373.8743793939357

In [None]:
#### We change n_cluster to 2

In [72]:
k_means = KMeans(n_clusters=2, verbose=5, random_state=45, n_jobs=4)
y_pred = k_means.fit_predict(X_train)

In [73]:
calinski_harabaz_score(X_train, y_pred)

5741.110567894496

#### Perform very well.

In [74]:
k_means = KMeans(n_clusters=4, random_state=45, n_jobs=4)
y_pred = k_means.fit_predict(X_train)

In [75]:
calinski_harabaz_score(X_train, y_pred)

3709.9782332247237

####  cluster increases, the performance go down.

各模型的优缺点  

1.Linear Regression

优点：
1.建模速度快，运行速度快
2.模型的可解释性好  

缺点：
1.对异常值敏感
2.无法拟合复杂的非线性关系  

2.Logistic Regression

优点：
1.形式简单，模型可解释性很好，如果某个特征的权重特别大，那么代表这个特征对结果的影响很大，这个特征非常重要
2.模型效果不错，特征工程做的好，在工程上都是可以接受的；特征工程可以大家并行开发，提高开发速度
3.训练速度快，计算量只和特征的数目相关；并且逻辑回归的分布式优化 sgd 发展较为成熟，训练的速度通过分布式优化进一步提高
4.资源占用小，只存储各个特征对应的权重
5.方便调整输出结果，即阈值的调整  

缺点：
1.准确率并不是很高。因为形式非常的简单(非常类似线性模型)，很难去拟合数据的真实分布
2.处理非线性数据较麻烦。逻辑回归在不引入其他方法的情况下，只能处理线性可分的数据，或者进一步说，处理二分类的问题。
3.逻辑回归本身无法筛选特征。有时候，我们会用 gbdt 来筛选特征，然后再上逻辑回归  

2.KNN

优点：
1.理论成熟，思想简单，既可以用来做分类也可以用来做回归
2.可用于非线性分类
3.KNN 理论简单，容易实现  

缺点：
1.样本不平衡问题，效果差
2.需要大量内存
3.对于样本容量大的数据集计算量比较大（体现在距离计算上）
4.KNN每一次分类都会重新进行一次全局运算  

3.SVM

优点：
1.对于高维度数据非常有效
2.当特征数量多余训练数据时，表现依然非常好
3.当类别是完全可分的时候，是最好的算法
4.泛化错误率低
5.计算开销小，虽然循环计算子问题多，但是每个子问题都是解析求解，速度快
6.能够处理非线性特征的相互作用，将输入空间映射到特征空间的过程中可能发生特征之间的组合  

缺点：
1.对参数调节和核函数的选择过敏感
2.对噪声和缺失数据敏感  

4.Naive Bayes

优点：
1.源于古典数学理论，有稳定的分类效率
2.对小规模数据表现较好，能处理多分类任务；适合增量式训练，尤其是数据量超出内存时，可以一批批地去增量训练
3.对缺失数据不敏感  

缺点：
1.在实际应用过程中，属性个数往往较多或者属性之间相关性较大，则过于违背特征独立性假设，导致分类效果不好
2.需要知道先验概率，且先验概率很多时候取决于假设，若假设的模型不合适，则会导致预测效果不佳
3.对输入数据的表达形式很敏感，输入数据的表达形式若较为接近则也会影响特征独立性假设