In [17]:
import jieba
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

import warnings
warnings.filterwarnings('ignore')

%matplotlib notebook

### Data Preparation

In [3]:
fname = pd.read_csv('C:/Users/ywan3/Downloads/sqlResult_1558435.csv',encoding='gb18030')

In [4]:
fname.head(5)

Unnamed: 0,id,author,source,content,feature,title,url
0,89617,,快科技@http://www.kkj.cn/,此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""37""...",小米MIUI 9首批机型曝光：共计15款,http://www.cnbeta.com/articles/tech/623597.htm
1,89616,,快科技@http://www.kkj.cn/,骁龙835作为唯一通过Windows 10桌面平台认证的ARM处理器，高通强调，不会因为只考...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""15""...",骁龙835在Windows 10上的性能表现有望改善,http://www.cnbeta.com/articles/tech/623599.htm
2,89615,,快科技@http://www.kkj.cn/,此前的一加3T搭载的是3400mAh电池，DashCharge快充规格为5V/4A。\r\n...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""18""...",一加手机5细节曝光：3300mAh、充半小时用1天,http://www.cnbeta.com/articles/tech/623601.htm
3,89614,,新华社,这是6月18日在葡萄牙中部大佩德罗冈地区拍摄的被森林大火烧毁的汽车。新华社记者张立云摄\r\n,"{""type"":""国际新闻"",""site"":""环球"",""commentNum"":""0"",""j...",葡森林火灾造成至少62人死亡 政府宣布进入紧急状态（组图）,http://world.huanqiu.com/hot/2017-06/10866126....
4,89613,胡淑丽_MN7479,深圳大件事,（原标题：44岁女子跑深圳约会网友被拒，暴雨中裸身奔走……）\r\n@深圳交警微博称：昨日清...,"{""type"":""新闻"",""site"":""网易热门"",""commentNum"":""978"",...",44岁女子约网友被拒暴雨中裸奔 交警为其披衣相随,http://news.163.com/17/0618/00/CN617P3Q0001875...


In [5]:
len(fname)

89611

In [6]:
news_dropna = fname.dropna(subset=['source', 'content'])
len(news_dropna)

87052

In [7]:
def transform(line):
    class_ = 1 if line['source'] == '新华社' else 0
    return pd.Series([class_, line['content']], index=['y', 'content'])

In [8]:
data = news_dropna.apply(transform, axis=1)

In [9]:
data.head()

Unnamed: 0,y,content
0,0,此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/...
1,0,骁龙835作为唯一通过Windows 10桌面平台认证的ARM处理器，高通强调，不会因为只考...
2,0,此前的一加3T搭载的是3400mAh电池，DashCharge快充规格为5V/4A。\r\n...
3,1,这是6月18日在葡萄牙中部大佩德罗冈地区拍摄的被森林大火烧毁的汽车。新华社记者张立云摄\r\n
4,0,（原标题：44岁女子跑深圳约会网友被拒，暴雨中裸身奔走……）\r\n@深圳交警微博称：昨日清...


In [10]:
corpus = data.content.to_list()
y = data.y.values.astype(np.int)

In [11]:
y.shape, len(corpus)

((87052,), 87052)

### 使用 tfidf 进行文本向量化

In [12]:
corpus_cut = []
mask = []
for sentence in tqdm(corpus):
    if not isinstance(sentence, str):
        mask.append(False)
        continue
    mask.append(True)
    sentence = ''.join(re.findall(r'\w+', sentence))
    corpus_cut.append(' '.join(jieba.cut(sentence)))
len(corpus_cut)

  0%|                                                                                        | 0/87052 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\ywan3\AppData\Local\Temp\jieba.cache
Loading model cost 0.803 seconds.
Prefix dict has been built succesfully.
100%|███████████████████████████████████████████████████████████████████████████| 87052/87052 [02:49<00:00, 513.89it/s]


87052

In [13]:
y = y[mask]
len(y)

87052

In [14]:
corpus_cut[1]

'骁龙 835 作为 唯一 通过 Windows10 桌面 平台 认证 的 ARM 处理器 高通 强调 不会 因为 只 考虑 性能 而 去 屏蔽掉 小 核心 相反 他们 正 联手 微软 找到 一种 适合 桌面 平台 的 兼顾 性能 和 功耗 的 完美 方案 报道 称 微软 已经 拿到 了 一些 新 的 源码 以便 Windows10 更好 地 理解 biglittle 架构 资料 显示 骁龙 835 作为 一款 集成 了 CPUGPU 基带 蓝牙 WiFi 的 SoC 比 传统 的 Wintel 方案 可以 节省 至少 30 的 PCB 空间 按计划 今年 Q4 华硕 惠普 联想 将 首发 骁龙 835Win10 电脑 预计 均 是 二合一 形态 的 产品 当然 高通 骁龙 只是 个 开始 未来 也许 还 能 见到 三星 Exynos 联发科 华为 麒麟 小米 澎湃 等 进入 Windows10 桌面 平台'

In [15]:
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=300)
X = vectorizer.fit_transform(corpus_cut)

In [16]:
X = X.toarray()
X.shape

(87052, 300)

### Model Building

In [18]:
random_state = 2019

In [19]:
X_train, x_test, Y_train, y_test = train_test_split(X, y, random_state=random_state, test_size=0.15)

In [20]:
X_train.shape, x_test.shape

((73994, 300), (13058, 300))

### KNN

#### KNeighborsClassifier, k=5

In [21]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=42, test_size=0.15)

In [22]:
x_train.shape, x_valid.shape

((62894, 300), (11100, 300))

In [23]:
knc = KNeighborsClassifier(n_jobs=-1) 
knc.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform')

In [24]:
knc.score(x_valid, y_valid)

0.9175675675675675

In [26]:
y_pred = knc.predict(x_valid)
y_pred_prob = knc.predict_proba(x_valid)

In [27]:
y_pred.shape, y_pred_prob.shape

((11100,), (11100, 2))


$$precision=\frac{TP}{TP+FP}$$
$$recall=\frac{TP}{TP+FN}$$
$$\frac{1}{F1}=\frac{1}{2}*(\frac{1}{P}+\frac{1}{R})$$
$$F1 Score=\frac{2PR}{P+R}$$

In [28]:
precision_score(y_valid, y_pred)

0.9552164372631159

In [29]:
recall_score(y_valid, y_pred)

0.9535995220551629

In [30]:
f1_score(y_valid, y_pred)

0.9544072948328267

In [31]:
roc_auc_score(y_valid, y_pred_prob[:, 1])

0.9008588518754408

#### Now I change Kneighbor classifier to k=3

In [33]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=62, test_size=0.15)

knc = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
knc.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=3, p=2,
                     weights='uniform')

In [34]:
y_pred = knc.predict(x_valid)
y_pred_prob = knc.predict_proba(x_valid)

In [35]:
knc.score(x_valid, y_valid)

0.9307207207207208

In [36]:
recall_score(y_valid, y_pred)

0.9696364362764682

In [37]:
f1_score(y_valid, y_pred)

0.9619024027743374

In [38]:
roc_auc_score(y_valid, y_pred_prob[:, 1])

0.8839187552143312

#### From above, I find precision and recall are increased but roc decreased.

#### Now, I change Kneighbor classifer K=7

In [40]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=62, test_size=0.15)

In [41]:
knc = KNeighborsClassifier(n_neighbors=7, n_jobs=-1)
knc.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=7, p=2,
                     weights='uniform')

In [42]:
y_pred = knc.predict(x_valid)
y_pred_prob = knc.predict_proba(x_valid)

In [43]:
y_pred.shape, y_pred_prob.shape

((11100,), (11100, 2))

In [44]:
knc.score(x_valid, y_valid)

0.9181981981981981

In [45]:
precision_score(y_valid, y_pred)

0.9538384845463609

In [46]:
recall_score(y_valid, y_pred)

0.9555533359968038

In [47]:
f1_score(y_valid, y_pred)

0.9546951402055682

In [48]:
roc_auc_score(y_valid, y_pred_prob[:, 1])

0.9183032750405395

#### Use "distance" as weight

In [49]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=72, test_size=0.15)
knc = KNeighborsClassifier(n_neighbors=7, n_jobs=-1, weights='distance')
knc.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=7, p=2,
                     weights='distance')

In [50]:
y_pred = knc.predict(x_valid)
y_pred_prob = knc.predict_proba(x_valid)

In [51]:
y_pred.shape, y_pred_prob.shape

((11100,), (11100, 2))

In [52]:
knc.score(x_valid, y_valid)

0.9277477477477477

In [53]:
precision_score(y_valid, y_pred)

0.958291956305859

In [54]:
recall_score(y_valid, y_pred)

0.9619218500797448

In [55]:
f1_score(y_valid, y_pred)

0.9601034722913143

In [56]:
roc_auc_score(y_valid, y_pred_prob[:, 1])

0.9307603776529338

#### From above, there is big improvement of "diatance" weight

#### Now I will try to improve leafsize

In [57]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=92, test_size=0.15)
knc = KNeighborsClassifier(n_neighbors=7, n_jobs=-1, weights='distance', leaf_size=50)
knc.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=50, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=7, p=2,
                     weights='distance')

In [58]:
y_pred = knc.predict(x_valid)
y_pred_prob = knc.predict_proba(x_valid)

In [59]:
knc.score(x_valid, y_valid)

0.9318918918918919

In [60]:
precision_score(y_valid, y_pred)

0.960087370929309

In [61]:
recall_score(y_valid, y_pred)

0.9646847565841979

In [62]:
f1_score(y_valid, y_pred)

0.9623805732484076

In [63]:
roc_auc_score(y_valid, y_pred_prob[:, 1])

0.9372612143495018

#### Cool, I will continue increasing leaf-size

In [64]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=92, test_size=0.15)
knc = KNeighborsClassifier(n_neighbors=7, n_jobs=-1, weights='distance', leaf_size=70)
knc.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=70, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=7, p=2,
                     weights='distance')

In [65]:
y_pred = knc.predict(x_valid)
y_pred_prob = knc.predict_proba(x_valid)

In [66]:
knc.score(x_valid, y_valid)

0.9315315315315316

In [68]:
precision_score(y_valid, y_pred)

0.960528932193279

In [69]:
recall_score(y_valid, y_pred)

0.9637869114126097

In [70]:
f1_score(y_valid, y_pred)

0.962155163828304

In [71]:
roc_auc_score(y_valid, y_pred_prob[:, 1])

0.9365350296834066

#### OK,I stop here.

In [72]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=92, test_size=0.15)
knc = KNeighborsClassifier(n_neighbors=7, n_jobs=-1, weights='distance', leaf_size=40) 
knc.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=40, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=7, p=2,
                     weights='distance')

In [73]:
y_pred = knc.predict(x_valid) 
y_pred_prob = knc.predict_proba(x_valid)

In [74]:
knc.score(x_valid, y_valid)

0.9318918918918919

In [75]:
precision_score(y_valid, y_pred)

0.960087370929309

In [76]:
recall_score(y_valid, y_pred)

0.9646847565841979

In [78]:
f1_score(y_valid, y_pred)

0.9623805732484076

In [79]:
roc_auc_score(y_valid, y_pred_prob[:, 1])

0.9372612143495018

#### Thus, KNN best parameters temporarily are $n\_neighbors=7, weights='distance',leaf\_size=40$

### Naive Bayes

In [80]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=1002, test_size=0.15)

In [81]:
gnb = GaussianNB()
gnb.fit(x_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [82]:
y_pred = gnb.predict(x_valid) 
y_pred_prob = gnb.predict_proba(x_valid)

In [83]:
gnb.score(x_valid, y_valid)

0.807027027027027

In [84]:
precision_score(y_valid, y_pred)

0.9981153411232567

In [85]:
recall_score(y_valid, y_pred)

0.7887995233839737

In [86]:
f1_score(y_valid, y_pred)

0.8811980033277869

In [87]:
roc_auc_score(y_valid, y_pred_prob[:, 1])

0.9410166438307452

#### BN ROC performs better than KNN

### Logistic Regression

In [88]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=1002, test_size=0.15)

In [89]:
lr = LogisticRegression(n_jobs=-1) # 先来个 baseline
lr.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=-1, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [90]:
lr.score(x_valid, y_valid)

0.9807207207207207

In [91]:
y_pred = lr.predict(x_valid)
y_pred_prob = lr.predict_proba(x_valid)

In [92]:
precision_score(y_valid, y_pred)

0.9876323340259227

In [93]:
recall_score(y_valid, y_pred)

0.9911627445139509

In [94]:
f1_score(y_valid, y_pred)

0.9893943899296264

In [95]:
roc_auc_score(y_valid, y_pred_prob[:, 1])

0.9937303744000685

#### Performs so well...

### Use test set to choose model

#### KNN

In [96]:
knc.score(x_test, y_test)

0.9261755245826313

In [97]:
y_pred = knc.predict(x_test)
y_pred_prob = knc.predict_proba(x_test)

In [98]:
precision_score(y_test, y_pred)

0.9586574230639161

In [99]:
recall_score(y_test, y_pred)

0.9597934653800576

In [100]:
f1_score(y_test, y_pred)

0.9592251078588951

In [101]:
roc_auc_score(y_test, y_pred_prob[:, 1])

0.9230684805264014

#### Naive Bayes

In [102]:
gnb.score(x_test, y_test)

0.79621687854189

In [103]:
y_pred = gnb.predict(x_test) 
y_pred_prob = gnb.predict_proba(x_test)

In [104]:
precision_score(y_test, y_pred)

0.9984751116436118

In [105]:
recall_score(y_test, y_pred)

0.7759437954968681

In [106]:
f1_score(y_test, y_pred)

0.8732555370326268

In [107]:
roc_auc_score(y_test, y_pred_prob[:, 1])

0.9375509641130992

#### Logistic Regression

In [108]:
lr.score(x_test, y_test)

0.9790932761525502

In [109]:
y_pred = lr.predict(x_test)
y_pred_prob = lr.predict_proba(x_test)

In [110]:
precision_score(y_test, y_pred)

0.9856914401144685

In [111]:
recall_score(y_test, y_pred)

0.9912815303876756

In [112]:
f1_score(y_test, y_pred)

0.9884785819793206

In [113]:
roc_auc_score(y_test, y_pred_prob[:, 1])

0.9906651640078233

#### Logistic regression works very well, so I choose logistic regression
#### When variance in train set is dereasing but variance of test set start increasing, this is the signal of overfitting

### 找出所有预测为 1， 但是实际为 0 的文章。 作为抄袭的候选者

In [114]:
y_pred = lr.predict(X)
y_pred.shape, y.shape

((87052,), (87052,))

In [115]:
len(news_dropna)

87052

In [116]:
news_dropna.head(5)

Unnamed: 0,id,author,source,content,feature,title,url
0,89617,,快科技@http://www.kkj.cn/,此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""37""...",小米MIUI 9首批机型曝光：共计15款,http://www.cnbeta.com/articles/tech/623597.htm
1,89616,,快科技@http://www.kkj.cn/,骁龙835作为唯一通过Windows 10桌面平台认证的ARM处理器，高通强调，不会因为只考...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""15""...",骁龙835在Windows 10上的性能表现有望改善,http://www.cnbeta.com/articles/tech/623599.htm
2,89615,,快科技@http://www.kkj.cn/,此前的一加3T搭载的是3400mAh电池，DashCharge快充规格为5V/4A。\r\n...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""18""...",一加手机5细节曝光：3300mAh、充半小时用1天,http://www.cnbeta.com/articles/tech/623601.htm
3,89614,,新华社,这是6月18日在葡萄牙中部大佩德罗冈地区拍摄的被森林大火烧毁的汽车。新华社记者张立云摄\r\n,"{""type"":""国际新闻"",""site"":""环球"",""commentNum"":""0"",""j...",葡森林火灾造成至少62人死亡 政府宣布进入紧急状态（组图）,http://world.huanqiu.com/hot/2017-06/10866126....
4,89613,胡淑丽_MN7479,深圳大件事,（原标题：44岁女子跑深圳约会网友被拒，暴雨中裸身奔走……）\r\n@深圳交警微博称：昨日清...,"{""type"":""新闻"",""site"":""网易热门"",""commentNum"":""978"",...",44岁女子约网友被拒暴雨中裸奔 交警为其披衣相随,http://news.163.com/17/0618/00/CN617P3Q0001875...


In [117]:
news_dropna['y'] = y
news_dropna['y_pred'] = y_pred
news_dropna.head(5)

Unnamed: 0,id,author,source,content,feature,title,url,y,y_pred
0,89617,,快科技@http://www.kkj.cn/,此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""37""...",小米MIUI 9首批机型曝光：共计15款,http://www.cnbeta.com/articles/tech/623597.htm,0,0
1,89616,,快科技@http://www.kkj.cn/,骁龙835作为唯一通过Windows 10桌面平台认证的ARM处理器，高通强调，不会因为只考...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""15""...",骁龙835在Windows 10上的性能表现有望改善,http://www.cnbeta.com/articles/tech/623599.htm,0,0
2,89615,,快科技@http://www.kkj.cn/,此前的一加3T搭载的是3400mAh电池，DashCharge快充规格为5V/4A。\r\n...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""18""...",一加手机5细节曝光：3300mAh、充半小时用1天,http://www.cnbeta.com/articles/tech/623601.htm,0,0
3,89614,,新华社,这是6月18日在葡萄牙中部大佩德罗冈地区拍摄的被森林大火烧毁的汽车。新华社记者张立云摄\r\n,"{""type"":""国际新闻"",""site"":""环球"",""commentNum"":""0"",""j...",葡森林火灾造成至少62人死亡 政府宣布进入紧急状态（组图）,http://world.huanqiu.com/hot/2017-06/10866126....,1,1
4,89613,胡淑丽_MN7479,深圳大件事,（原标题：44岁女子跑深圳约会网友被拒，暴雨中裸身奔走……）\r\n@深圳交警微博称：昨日清...,"{""type"":""新闻"",""site"":""网易热门"",""commentNum"":""978"",...",44岁女子约网友被拒暴雨中裸奔 交警为其披衣相随,http://news.163.com/17/0618/00/CN617P3Q0001875...,0,0


In [118]:
# 实际为 0 ，预测为 1
copy_news = news_dropna[(news_dropna.y == 0) & (news_dropna.y_pred == 1)]
copy_news.head(5)

Unnamed: 0,id,author,source,content,feature,title,url,y,y_pred
51,89566,,新华网,戈壁的大漠黄沙曾掩埋了无数西域古道，而如今一条大漠天路正顽强地与黄沙“搏斗”，在乌兰布和、腾...,"{""type"":""国内新闻"",""site"":""环球"",""commentNum"":""0"",""j...",大漠变通途——世界上最长的穿越沙漠高速公路建设纪实,http://china.huanqiu.com/hot/2017-06/10866392....,0,1
56,89561,,央视新闻,很快，不少人主动添加记者为好友，询问是否需要扫描软件，并声称这些扫描软件能够攻破摄像头的IP...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""15""...",大量家庭摄像头遭入侵 有人兜售IP地址给偷窥者,http://www.cnbeta.com/articles/tech/623631.htm,0,1
61,89556,,央视新闻客户端@,在QQ搜索栏输入“摄像头 破解”，就跳出了众多相关聊天群，记者随机加入了几个，发现聊天的内容...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""26""...",你家的摄像头安全吗? 大量家庭摄像头遭入侵,http://www.cnbeta.com/articles/tech/623635.htm,0,1
63,89554,,人民日报,2017年上海国际电影电视节，包括6月12日至16日举办的第二十三届上海电视节和6月17日至...,"{""type"":""国内新闻"",""site"":""环球"",""commentNum"":""0"",""j...",聚焦上海国际电影节：“一带一路”，照耀光影世界,http://china.huanqiu.com/hot/2017-06/10866419....,0,1
68,89549,,中国新闻网,中新网6月23日电 6月22日，阿里巴巴旗下UC神马搜索主办的“U势 智赢”乘风大会在广州顺...,"{""type"":""IT业界"",""site"":""参考消息"",""commentNum"":""0"",...",乘风大会亮相广州 UC神马启动“风车计划”,http://www.cankaoxiaoxi.com/science/20170623/2...,0,1


In [119]:
len(copy_news.source)

1106

In [120]:
copy_sources = set(copy_news.source.to_list())

In [121]:
len(copy_sources)

276

## 什么是数据思维？什么是机器学习思维？

数据思维的最核心是利用数据解决问题，利用数据解决问题的最核心是要深度了解需求，了解真正要解决什么样的问题，解决问题背后的真实目的是什么。在解决问题的过程中我们使用数据的方法，通常可以叫量化的方法
机器学习思维就是根据大数据学习出一种规则，这个规则可以将输入中的 x 映射至 y 而不像传统的方法由人工写各种繁琐的规则，机器学习模型可以利用继续增加的数据不断迭代优化模型的表现