# K-近邻算法（KNN）

#### 1、预测年收入是否大于50K美元

读取adult.txt文件，最后一列是年收入，并使用KNN算法训练模型，然后使用模型预测一个人的年收入是否大于50

In [95]:
# 处理分类问题
from sklearn.neighbors import KNeighborsClassifier
# 处理回归问题
from sklearn.neighbors import KNeighborsRegressor
import numpy as np 
import pandas as pd
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
adults = pd.read_table('../data/adults.txt',sep=',',low_memory=False)
adults.head()

Unnamed: 0,age,workclass,final_weight,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


获取年龄、教育程度、职位、每周工作时间作为机器学习数据  
获取薪水作为对应结果

In [14]:
adults_data = adults[['age','education','occupation','hours_per_week']]
adults_data.head()

Unnamed: 0,age,education,occupation,hours_per_week
0,39,Bachelors,Adm-clerical,40
1,50,Bachelors,Exec-managerial,13
2,38,HS-grad,Handlers-cleaners,40
3,53,11th,Handlers-cleaners,40
4,28,Bachelors,Prof-specialty,40


数据转换，将String类型数据转换为int

In [13]:
Series(adults_data['education']).unique()

array(['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
       'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
       '5th-6th', '10th', '1st-4th', 'Preschool', '12th'], dtype=object)

In [15]:
Series(adults_data['occupation']).unique()

array(['Adm-clerical', 'Exec-managerial', 'Handlers-cleaners',
       'Prof-specialty', 'Other-service', 'Sales', 'Craft-repair',
       'Transport-moving', 'Farming-fishing', 'Machine-op-inspct',
       'Tech-support', '?', 'Protective-serv', 'Armed-Forces',
       'Priv-house-serv'], dtype=object)

【知识点】map方法，进行数据转换

切片：训练数据和预测数据

生成算法

第一步：训练数据

第二步：预测数据

#### 2、小麦种类预测

读取seeds.tsv文件，最后一列是小麦品种，其他列是小麦特征

In [59]:
seeds = pd.read_table('../data/seeds.tsv',sep='	',low_memory=False)
seeds.columns=list([1,2,3,4,5,6,7,8])
seeds

Unnamed: 0,1,2,3,4,5,6,7,8
0,14.88,14.57,0.8811,5.554,3.333,1.0180,4.956,Kama
1,14.29,14.09,0.9050,5.291,3.337,2.6990,4.825,Kama
2,13.84,13.94,0.8955,5.324,3.379,2.2590,4.805,Kama
3,16.14,14.99,0.9034,5.658,3.562,1.3550,5.175,Kama
4,14.38,14.21,0.8951,5.386,3.312,2.4620,4.956,Kama
5,14.69,14.49,0.8799,5.563,3.259,3.5860,5.219,Kama
6,14.11,14.10,0.8911,5.420,3.302,2.7000,5.000,Kama
7,16.63,15.46,0.8747,6.053,3.465,2.0400,5.877,Kama
8,16.44,15.25,0.8880,5.884,3.505,1.9690,5.533,Kama
9,15.26,14.85,0.8696,5.714,3.242,4.5430,5.314,Kama


In [78]:
# 将数据打乱
# 这样对可以对df进行shuffle。
# 其中参数frac是要返回的比例，比如df中有10行数据，我只想返回其中的30%,那么frac=0.3。
data = DataFrame(seeds).sample(frac=0.8)
y_train = data[8]
X_train = data.iloc[:,0:7]

In [79]:
display(X_train.shape,y_train.shape)

(167, 7)

(167,)

In [97]:
display(X_train,y_train)

Unnamed: 0,1,2,3,4,5,6,7
194,12.80,13.47,0.8860,5.160,3.126,4.8730,4.914
151,12.26,13.60,0.8333,5.408,2.833,4.7560,5.360
160,12.02,13.33,0.8503,5.350,2.810,4.2710,5.308
31,14.09,14.41,0.8529,5.717,3.186,3.9200,5.299
153,11.36,13.05,0.8382,5.175,2.755,4.0480,5.263
116,19.15,16.45,0.8890,6.245,3.815,3.0840,6.185
164,12.10,13.15,0.8793,5.105,2.941,2.2010,5.056
130,18.94,16.32,0.8942,6.144,3.825,2.9080,5.949
63,12.78,13.57,0.8716,5.262,3.026,1.1760,4.782
136,15.57,15.15,0.8527,5.920,3.231,2.6400,5.879


194    Canadian
151    Canadian
160    Canadian
31         Kama
153    Canadian
116        Rosa
164    Canadian
130        Rosa
63         Kama
136        Rosa
113        Rosa
9          Kama
91         Rosa
188    Canadian
146    Canadian
201    Canadian
13         Kama
37         Kama
66         Kama
129        Rosa
175    Canadian
32         Kama
182    Canadian
43         Kama
122        Rosa
101        Rosa
26         Kama
208    Canadian
56         Kama
204    Canadian
         ...   
94         Rosa
36         Kama
79         Rosa
186    Canadian
148    Canadian
42         Kama
21         Kama
103        Rosa
53         Kama
99         Rosa
58         Kama
52         Kama
138        Rosa
134        Rosa
158    Canadian
87         Rosa
75         Rosa
69         Rosa
15         Kama
162    Canadian
190    Canadian
184    Canadian
198    Canadian
199    Canadian
70         Rosa
89         Rosa
95         Rosa
161    Canadian
120        Rosa
83         Rosa
Name: 8, Length: 167, dt

In [86]:
data = DataFrame(seeds).sample(frac=0.2)
y_test = data[8]
X_test = data.iloc[:,0:7]

In [87]:
display(X_test,y_test)

Unnamed: 0,1,2,3,4,5,6,7
176,10.74,12.73,0.8329,5.145,2.642,4.702,4.963
91,18.81,16.29,0.8906,6.272,3.693,3.237,6.053
18,12.72,13.57,0.8686,5.226,3.049,4.102,4.914
82,19.57,16.74,0.8779,6.384,3.772,1.472,6.273
66,14.01,14.29,0.8625,5.609,3.158,2.217,5.132
156,12.13,13.73,0.8081,5.394,2.745,4.825,5.22
111,19.13,16.31,0.9035,6.183,3.902,2.109,5.924
78,17.12,15.55,0.8892,5.85,3.566,2.858,5.746
190,11.27,12.86,0.8563,5.091,2.804,3.985,5.001
99,16.41,15.25,0.8866,5.718,3.525,4.217,5.618


176    Canadian
91         Rosa
18         Kama
82         Rosa
66         Kama
156    Canadian
111        Rosa
78         Rosa
190    Canadian
99         Rosa
122        Rosa
117        Rosa
93         Rosa
154    Canadian
90         Rosa
54         Kama
200    Canadian
37         Kama
109        Rosa
27         Kama
25         Kama
14         Kama
103        Rosa
23         Kama
164    Canadian
100        Rosa
173    Canadian
134        Rosa
86         Rosa
121        Rosa
112        Rosa
74         Rosa
129        Rosa
95         Rosa
36         Kama
83         Rosa
17         Kama
144    Canadian
201    Canadian
125        Rosa
11         Kama
208    Canadian
Name: 8, dtype: object

In [80]:
knnclf = KNeighborsClassifier(n_neighbors=5)

In [81]:
knnclf.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [91]:
y_ = knnclf.predict(X_test)
# (y_ == y_test).sum()/y_.size
knnclf.score(X_test,y_test)

0.8809523809523809

In [99]:
y_train.unique()

array(['Canadian', 'Kama', 'Rosa'], dtype=object)

In [98]:
a = ['Canadian', 'Kama', 'Rosa']


TypeError: 'int' object is not iterable

#### 3、改进约会网站的匹配效果

读取datingTestSet.txt文件，最后一列是喜欢程度。模型：根据前几列的信息，预测喜欢程度

In [103]:
dating = pd.read_table('../data/datingTestSet.txt',sep='	',low_memory=False)
dating

Unnamed: 0,40920,8.326976,0.953952,largeDoses
0,14488,7.153469,1.673904,smallDoses
1,26052,1.441871,0.805124,didntLike
2,75136,13.147394,0.428964,didntLike
3,38344,1.669788,0.134296,didntLike
4,72993,10.141740,1.032955,didntLike
5,35948,6.830792,1.213192,largeDoses
6,42666,13.276369,0.543880,largeDoses
7,67497,8.631577,0.749278,didntLike
8,35483,12.273169,1.508053,largeDoses
9,50242,3.723498,0.831917,didntLike
