In [88]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC,SVR
from sklearn import preprocessing
import sklearn.metrics as sm
from sklearn.model_selection import train_test_split,cross_val_score

# load datasets
def load_data(input_file):
    """Read a comma-separated text file, dropping the second field of each row.

    Parameters
    ----------
    input_file : str or path-like
        Path of the text file; one comma-separated record per line.

    Returns
    -------
    numpy.ndarray
        Array of strings with one row per non-blank line, holding field 0
        followed by fields 2 onward (field 1 is discarded).
    """
    X = []
    with open(input_file, 'r') as f:
        # Iterate lazily; there is no need to materialise the file with readlines().
        for line in f:
            # Strip only the line terminator. Slicing with [:-1] cut off the last
            # real character whenever the final line had no trailing newline.
            line = line.rstrip('\r\n')
            if not line.strip():
                # A blank line would yield a short row and make the array ragged.
                continue
            data = line.split(',')
            X.append([data[0]] + data[2:])
    return np.array(X)

def load_data_all(input_file):
    """Read a comma-separated text file, keeping every field of each row.

    Parameters
    ----------
    input_file : str or path-like
        Path of the text file; one comma-separated record per line.

    Returns
    -------
    numpy.ndarray
        Array of strings with one row per non-blank line and one column per
        comma-separated field.
    """
    X = []
    with open(input_file, 'r') as f:
        # Iterate lazily; there is no need to materialise the file with readlines().
        for line in f:
            # Strip only the line terminator. Slicing with [:-1] cut off the last
            # real character whenever the final line had no trailing newline.
            line = line.rstrip('\r\n')
            if not line.strip():
                # A blank line would yield a one-field row and make the array ragged.
                continue
            X.append(line.split(','))
    return np.array(X)

def label_encode(X, return_encoders=False):
    """Convert a 2-D array of strings into integer features and an integer target.

    A column is treated as numeric when its value in the first row consists
    only of digits; it is then copied as-is. Every other column is mapped to
    integer codes by its own freshly fitted ``preprocessing.LabelEncoder``.

    Parameters
    ----------
    X : numpy.ndarray
        2-D array of strings; the last column is the target.
    return_encoders : bool, default False
        When True, also return the fitted encoders so that new samples can be
        encoded (and predictions decoded) consistently with the training data.
        Previously the encoders were local to this function and discarded,
        which made the function unusable for that purpose.

    Returns
    -------
    features : numpy.ndarray of int
        All columns except the last.
    target : numpy.ndarray of int
        The last column.
    encoders : list of LabelEncoder
        Only when ``return_encoders`` is True: one encoder per non-numeric
        column, in column order (the target's encoder, if any, is last).
    """
    encoders = []
    X_encoded = np.empty(X.shape)
    # Only the first row is inspected to decide whether a column is numeric.
    for i, item in enumerate(X[0]):
        if item.isdigit():
            X_encoded[:, i] = X[:, i]
        else:
            encoder = preprocessing.LabelEncoder()
            X_encoded[:, i] = encoder.fit_transform(X[:, i])
            encoders.append(encoder)

    features = X_encoded[:, :-1].astype(int)
    target = X_encoded[:, -1].astype(int)
    if return_encoders:
        return features, target, encoders
    return features, target

In [95]:
# Build an SVM (used here as a classifier) on the building-event data:
# the model predicts the event class from the day, the time and two numeric
# columns (presumably the counts of people entering/leaving the building).
# building binary event     : whether an event is being held
# building multiclass event : which kind of event is being held

# load datasets
input_file = 'building_event_multiclass.txt'
X = load_data(input_file)

# label encode: a column is copied as-is when its first-row value is all
# digits; otherwise it gets its own LabelEncoder (kept in column order).
label_encoder = []
X_encoded = np.empty(X.shape)
for i,item in enumerate(X[0]):
    if item.isdigit():
        X_encoded[:,i] = X[:,i]
    else:
        label_encoder.append(preprocessing.LabelEncoder())
        X_encoded[:,i] = label_encoder[-1].fit_transform(X[:,i])

# Split into features and target: every column but the last is a feature,
# the last column is the class label.
X,y = np.array(X_encoded[:,:-1].astype(int)), np.array(X_encoded[:,-1].astype(int))

print(X[:4],y[:4])
print(label_encoder)
print(type(label_encoder))

# SVM
params = {'kernel':'rbf', 'probability':True, 'class_weight':'balanced'}
classifier = SVC(**params)
classifier.fit(X, y)

# cross validation (3-fold accuracy)
accuracy = cross_val_score(classifier, X, y, scoring='accuracy', cv=3)
print("\nClassification Accuracy :",round(100*accuracy.mean(),3),"%")
print(label_encoder[0].transform(['Tuesday']))

# Predict on a new input datapoint.
# `count` walks label_encoder in column order, so the new datapoint must have
# non-numeric values in exactly the same columns as the first training row.
input_data = ['Tuesday', '09:30:00', '15', '13']
input_data_encoded = [-1]*len(input_data)
count = 0
for i,item in enumerate(input_data):
    if item.isdigit():
        input_data_encoded[i] = int(input_data[i])
    else:
        # transform() returns a 1-element array; take the element explicitly,
        # since int() on an ndarray with ndim > 0 is deprecated in NumPy >= 1.25.
        input_data_encoded[i] = int(label_encoder[count].transform([input_data[i]])[0])
        count += 1
input_data_encoded = np.array(input_data_encoded).reshape(1,-1)
print(input_data_encoded,type(input_data_encoded))
output_class = classifier.predict(input_data_encoded)
# The last encoder belongs to the target column, so it decodes the prediction.
print("\nOutput class:",label_encoder[-1].inverse_transform(output_class)[0])


[[ 4  6  9 26]
 [ 4  7  8 13]
 [ 4  8  6  3]
 [ 4  9 25 12]] [0 0 0 0]
[LabelEncoder(), LabelEncoder(), LabelEncoder()]
<class 'list'>

Classification Accuracy : 58.514 %
[4]
[[ 4  2 15 13]] <class 'numpy.ndarray'>

Output class: eventA


In [93]:
# Use an SVM (here as a regressor) to estimate traffic volume

# load and label datasets
input_file = 'traffic_data.txt'
X = load_data_all(input_file)
print(X[:4])

# A column is copied as-is when its first-row value is all digits; otherwise
# it gets its own LabelEncoder (kept in column order).
label_encoder = []
X_encoded = np.empty(X.shape)
for i,item in enumerate(X[0]):
    if item.isdigit():
        X_encoded[:,i] = X[:,i]
    else:
        label_encoder.append(preprocessing.LabelEncoder())
        X_encoded[:,i] = label_encoder[-1].fit_transform(X[:,i])

# Every column but the last is a feature; the last column is the target.
X, y = np.array(X_encoded[:,:-1].astype(int)), np.array(X_encoded[:,-1].astype(int))

print(X[:4],y[:4])
print(label_encoder)
print(type(label_encoder))

# training regressor SVM
# C: penalty applied to errors; epsilon: margin within which no penalty is applied
params = {'kernel':'rbf', 'C': 10.0, 'epsilon':0.2}
regressor = SVR(**params)
regressor.fit(X,y)

# NOTE: this is the in-sample (training-set) error, not cross-validation:
# the regressor is scored on the same X it was fitted on.
y_pred = regressor.predict(X)
print("\nMean absolute error =",round(sm.mean_absolute_error(y,y_pred),2))

# test on a new datapoint
# `count` walks label_encoder in column order, so the new datapoint must have
# non-numeric values in exactly the same columns as the first training row.
input_data = ['Tuesday', '13:35', 'San Francisco', 'yes']
input_data_encoded = [-1] * len(input_data)
count = 0

for i,item in enumerate(input_data):
    if item.isdigit():
        input_data_encoded[i] = int(input_data[i])
    else:
        # transform() returns a 1-element array; take the element explicitly,
        # since int() on an ndarray with ndim > 0 is deprecated in NumPy >= 1.25.
        input_data_encoded[i] = int(label_encoder[count].transform([input_data[i]])[0])
        count = count + 1
input_data_encoded = np.array(input_data_encoded).reshape(1,-1)
print("Predicted traffic:", int(regressor.predict(input_data_encoded)[0]))

[['Tuesday' '00:00' 'San Francisco' 'no' '3']
 ['Tuesday' '00:05' 'San Francisco' 'no' '8']
 ['Tuesday' '00:10' 'San Francisco' 'no' '10']
 ['Tuesday' '00:15' 'San Francisco' 'no' '6']]
[[ 5  0 13  0]
 [ 5  1 13  0]
 [ 5  2 13  0]
 [ 5  3 13  0]] [ 3  8 10  6]
[LabelEncoder(), LabelEncoder(), LabelEncoder(), LabelEncoder()]
<class 'list'>

Mean absolute error = 7.28
Predicted traffic: 29
