## 作业
- 使用sklearn框架库中的朴素贝叶斯算法，对垃圾邮件分类任务中的数据集进行分类实现
- 使用sklearn的KNeighborsClassifier api实现KNN算法，解决手写字体识别的分类问题
- 使用K折交叉验证寻找最优的K
- 使用KNN解决房价预测的回归问题

# 使用sklearn框架库中的朴素贝叶斯算法，对垃圾邮件分类任务中的数据集进行分类实现


In [15]:
import os
import pandas as pd
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

In [16]:
def word_segment(line, stopwords_list):
    word_list = []
    for word in jieba.cut(line):
        if word.isalpha() and word not in stopwords_list:
            word_list.append(word)
    return " ".join(word_list)

In [17]:
def feature_transform(texts):
    transformer = CountVectorizer()
    word_cnt_df = pd.DataFrame(transformer.fit_transform(texts).toarray())
    word_cnt_freq = pd.DataFrame(word_cnt_df.apply(sum, axis=0))
    word_keep = [word_cnt_freq.index[i] for i in range(word_cnt_freq.shape[0]) if word_cnt_freq.iloc[i, 0] > 5]
    features = word_cnt_df[word_keep]
    return features

In [18]:
def load_data(base_path):
    email_file_name = os.path.join(base_path, "chinesespam.xlsx")
    stopword_file_name = os.path.join(base_path, "stopwords.txt")
    stopwords_list = [i.strip() for i in open(stopword_file_name, 'r', encoding='utf8').readlines()]
    email_df = pd.read_excel(email_file_name, sheet_name=0)
    email_df['text'] = email_df.text.apply(lambda x: word_segment(x, stopwords_list))
    features = feature_transform(email_df['text'])
    labels = email_df['type']
    x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=1)
    return x_train, x_test, y_train, y_test

In [19]:
def model_training(x_train, y_train):
    model = BernoulliNB()
    model.fit(x_train, y_train)
    return model

In [20]:
base_path = "./raw_data"
x_train, x_test, y_train, y_test = load_data(base_path)
model = model_training(x_train, y_train)
print(model.score(x_test, y_test))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.824 seconds.
Prefix dict has been built successfully.


0.9


# 使用sklearn的KNeighborsClassifier api实现KNN算法，解决手写字体识别的分类问题

In [21]:
import os
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

In [22]:
def get_label_feature(folder_path):
    label_list = []
    feature_list = []
    for file_name in os.listdir(folder_path):
        label = file_name.split("_")[0]
        context = open(os.path.join(folder_path, file_name), 'r').read()
        feature = [int(i) for i in list("".join(context.split("\n")))]
        label_list.append(label)
        feature_list.append(feature)
    return np.array(feature_list), np.array(label_list)

In [23]:
def load_data(base_path):
    train_folder_path = os.path.join(base_path, "trainingDigits")
    test_folder_path = os.path.join(base_path, "testDigits")
    x_train, y_train = get_label_feature(train_folder_path)
    x_test, y_test = get_label_feature(test_folder_path)
    return x_train, y_train, x_test, y_test

In [24]:
def model_training(x_train, y_train):
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(x_train, y_train)
    return model

In [27]:
data_path = "./raw_data"
base_path = os.path.join(data_path, "handwritingClass")
x_train, y_train, x_test, y_test = load_data(base_path)
model = model_training(x_train, y_train)
print(model.score(x_test, y_test))

0.9809725158562368


# 使用K折交叉验证寻找最优的K

In [30]:
from sklearn.model_selection import GridSearchCV

In [31]:
def cross_validation(x_train, y_train, parameters):
    estimator = KNeighborsClassifier()
    clf = GridSearchCV(estimator=estimator, param_grid=parameters, cv=3, scoring='accuracy')
    clf.fit(x_train, y_train)
    return clf

In [32]:
parameters = {"n_neighbors": range(1, 10, 2)}
clf = cross_validation(x_train, y_train, parameters)
best_k = clf.best_params_
print("The best parameter is {}".format(best_k))

The best parameter is {'n_neighbors': 3}


# 使用KNN解决房价预测的回归问题

In [33]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

In [34]:
def load_data(base_path):
    file_path = os.path.join(base_path, "housing.data.txt")
    col_names = ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT", "MEDV"]
    df = pd.read_table(file_path, sep='\s+', names=col_names)
    label = np.array(df.pop('MEDV'))
    scaler = MinMaxScaler()
    features = scaler.fit_transform(df)
    x_train, x_test, y_train, y_test = train_test_split(features, label, test_size=0.3, random_state=123)
    return x_train, x_test, y_train, y_test

In [35]:
def model_traning(x_train, y_train, parameters):
    estimator = KNeighborsRegressor()
    clf = GridSearchCV(estimator=estimator, param_grid=parameters, cv=3, refit=True, scoring='neg_mean_squared_error')
    clf.fit(x_train, y_train)
    return clf

In [36]:
base_path = "./raw_data/housingPrice"
x_train, x_test, y_train, y_test = load_data(base_path)
parameters = {"n_neighbors": range(1, 10, 2)}
clf = model_traning(x_train, y_train, parameters)
print(clf.best_estimator_.score(x_test, y_test))

0.6144123268952374
