# Part I 

[Gas sensors for home activity monitoring Data Set](https://archive.ics.uci.edu/ml/datasets/Gas+sensors+for+home+activity+monitoring)


[KNN](https://github.com/eriklindernoren/ML-From-Scratch/blob/master/mlfromscratch/supervised_learning/k_nearest_neighbors.py)

[ML-From-Scratch Logistic Regression](https://github.com/eriklindernoren/ML-From-Scratch/blob/master/mlfromscratch/supervised_learning/logistic_regression.py)

[Numpy LR](https://blog.goodaudience.com/logistic-regression-from-scratch-in-numpy-5841c09e425f)

## Preprocessing

In [266]:
import numpy as np

In [267]:
metadata=np.loadtxt("./data/HT_Sensor_UCIsubmission/HT_Sensor_metadata.dat",skiprows=1,dtype=str)

In [268]:
metadata[metadata[:,2]=="background",2]=2
metadata[metadata[:,2]=="banana",2]=0
metadata[metadata[:,2]=="wine",2]=1

metadata=np.array(metadata[:,[0,2,3,4]],dtype=float)

In [278]:
dataset = np.loadtxt('./data/HT_Sensor_UCIsubmission/HT_Sensor_dataset.dat', skiprows=1)
datasetID=np.array(dataset[:,0],dtype=int)

In [279]:
# 仅仅挑选ID为banana与wine的
# 以及时间恰好有这两者存在的时间区间
selected = np.logical_and(metadata[datasetID,1]!=2,dataset[:,1]>0,dataset[:,1]<metadata[datasetID,3])
data=dataset[selected]
dataID=np.array(data[:,0],dtype=int)
data[:,0]=metadata[dataID,1]

#混淆in-place
np.random.shuffle(data)
#归一化
dataID=np.array(data[:,0],dtype=int)
data=(data-data.mean(axis=0))/data.std(axis=0)
data[:,0]=dataID
np.save("./data/HT_Sensor_UCIsubmission/data.npy",data)

## KNN

In [280]:
import numpy as np
from collections import Counter

In [281]:
data=np.load("./data/HT_Sensor_UCIsubmission/data.npy")
test_size=data.shape[0]//2
#分割train test 10%
X_test=data[:test_size,2:10]
y_test=data[:test_size,0].astype('int')

X_train=data[test_size:,2:10]
y_train=data[test_size:,0].astype('int')

print(X_test.shape,y_test.shape,X_train.shape,y_train.shape,data.shape)

(208227, 8) (208227,) (208228, 8) (208228,) (416455, 12)


In [282]:
class KNN():
    def __init__(self, k=5):
        self.k = k

    def predict(self, X_test, X_train, y_train):
        y_pred=np.empty(X_test.shape[0],dtype=int)
        for i,X in enumerate(X_test):
            print(i)
            y_pred[i]=np.bincount(y_train[np.argsort(np.linalg.norm(X-X_train,axis=1))[:self.k]]).argmax()
        return y_pred

In [283]:
model=KNN(1)
y_pred=model.predict(X_test[:100],X_train[:100],y_train)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [284]:
result=Counter(np.equal(y_pred[:100],y_test[:100])).most_common()
result

[(True, 83), (False, 17)]

In [285]:
y_pred[:1000]

array([1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0])

## Logistic Regression

In [286]:
import numpy as np
from collections import Counter
import math

In [287]:
data=np.load("./data/HT_Sensor_UCIsubmission/data.npy")
test_size=data.shape[0]//10
#分割train test 10%
X_test=data[:test_size,2:10]
y_test=data[:test_size,0].astype('int')

X_train=data[test_size:,2:10]
y_train=data[test_size:,0].astype('int')

print(X_test.shape,y_test.shape,X_train.shape,y_train.shape,data.shape)

(41645, 8) (41645,) (374810, 8) (374810,) (416455, 12)


In [306]:
class LogisticRegression():
    def __init__(self,lr=0.1):
        self.lr=lr
        
    def sigmoid(self,Z):
        return 1/(1+np.exp(-Z))
    
    def loss(self,y,y_hat):
        return -np.mean(y * np.log(y_hat)+(1-y)*np.log(1-y_hat))
    
    def fit(self,X_train,y_train,epochs=5000):
        limit=1/math.sqrt(X_train.shape[1])
        self.W=np.random.uniform(-limit,limit,(X_train.shape[1],))
        
        for i in range(epochs):
            y_hat=self.sigmoid(X_train @ self.W)
            self.W -= self.lr * (X_train.T @ (y_hat - y_train) / y_train.shape[0])
            if i %100 ==0:
                print(i,self.loss(y_train,y_hat))
                
    def predict(self,X_test):
        y_pred=self.sigmoid(X_test @ self.W)>0.5
        return y_pred.astype('int')

In [307]:
model=LogisticRegression(0.25)
model.fit(X_train,y_train)
y_pred=model.predict(X_test)

0 0.656998734976901
100 0.5713823051272089
200 0.569312030345533
300 0.5688399303241818
400 0.5686032337857182
500 0.5684460813783249
600 0.5683343785838477
700 0.5682537274781887
800 0.5681952639129977
900 0.568152835221936
1000 0.5681220321959587
1100 0.5680996665544801
1200 0.5680834264506179
1300 0.5680716340023159
1400 0.5680630710604959
1500 0.5680568531482376
1600 0.568052338037861
1700 0.5680490593944897
1800 0.5680466786000744
1900 0.5680449497736619
2000 0.5680436943725287
2100 0.5680427827498437
2200 0.5680421207634613
2300 0.5680416400523471
2400 0.5680412909760947
2500 0.5680410374881253
2600 0.5680408534129645
2700 0.5680407197430384
2800 0.5680406226757668
2900 0.568040552188212
3000 0.5680405010020572
3100 0.5680404638320237
3200 0.5680404368401077
3300 0.5680404172392644
3400 0.5680404030056222
3500 0.5680403926695026
3600 0.5680403851636647
3700 0.5680403797131064
3800 0.5680403757550413
3900 0.5680403728807886
4000 0.5680403707935735
4100 0.5680403692778874
4200 0.56

In [304]:
Counter(y_pred == y_test).most_common()

[(True, 28732), (False, 12913)]

# Part II-1

In [308]:
import numpy as np

In [319]:
ratings=np.loadtxt("./data/ml-1m/ratings.dat",delimiter="::",dtype=int)

In [320]:
ratings

array([[        1,      1193,         5, 978300760],
       [        1,       661,         3, 978302109],
       [        1,       914,         3, 978301968],
       ...,
       [     6040,       562,         5, 956704746],
       [     6040,      1096,         4, 956715648],
       [     6040,      1097,         4, 956715569]])

In [342]:
data=np.zeros((6040,3952))

In [343]:
for r in ratings:
    data[r[0]-1,r[1]-1]=r[2]

In [340]:
data

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 3., 0., ..., 0., 0., 0.]])

In [None]:
data[]