In [2]:
import pandas as pd 
import numpy as np
data=pd.read_csv("train_values.csv")
label=pd.read_csv("train_labels.csv")

## Naive Bayes Classifiers

In [2]:
# pre-processing the data
from sklearn.preprocessing import LabelEncoder 
le = LabelEncoder()

#use data_fit for the label of every label of the colone
data_fit = data

colone = list(data)
for colone in colone:
    data_fit[colone] = le.fit_transform(data[colone])
#这里必须要将训练集里面的非数字特征 转为 数字

In [3]:
#data and label
dataset = np.array(data_fit)
label_data = np.array(label["damage_grade"])


In [4]:
## split test and train  (set)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test  = train_test_split(dataset, label_data, test_size=0.2, random_state=50) 


In [5]:
# Using Gaussian Bayesian models
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix
#Set classifier
clf=GaussianNB()
#Training classifier
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))


[[  372  4596     0]
 [  501 29153     4]
 [  153 17336     6]]
              precision    recall  f1-score   support

           1       0.36      0.07      0.12      4968
           2       0.57      0.98      0.72     29658
           3       0.60      0.00      0.00     17495

    accuracy                           0.57     52121
   macro avg       0.51      0.35      0.28     52121
weighted avg       0.56      0.57      0.42     52121



## Support Vector Machine

In [20]:
# 600 samples from randomly selected data because the svm algorithm is particularly slow
data=pd.read_csv("train_values.csv")
data=data.sample(n=1000)
label=pd.read_csv("train_labels.csv")
label=label.sample(n=1000)

In [21]:
# pre-processing the data
from sklearn.preprocessing import LabelEncoder 
le = LabelEncoder()


#use data_fit for the label of every label of the colone
data_fit = data

colone = list(data)
for colone in colone:
    data_fit[colone] = le.fit_transform(data[colone])

In [22]:
#dimensionality reduction
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
print(pca)
data_fit=pca.fit_transform(data_fit)

PCA(n_components=10)


In [23]:
#data and label                           
dataset = np.array(data_fit)
label_data = np.array(label["damage_grade"])

In [24]:
#unbalanced data and label
label['damage_grade'].value_counts()

2    587
3    321
1     92
Name: damage_grade, dtype: int64

In [25]:
#One of the simplest ways to generate a sample of unbalanced data is to generate a sample of minority classes.
# turn into balanced data and set
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
x_resampled, y_resampled = ros.fit_resample(dataset, label_data)

In [26]:
import collections
print(sorted(collections.Counter(y_resampled).items()))

[(1, 587), (2, 587), (3, 587)]


In [27]:
## split test and train  (set)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test  = train_test_split(x_resampled, y_resampled, test_size=0.3, random_state=50)

In [28]:
sorted(collections.Counter(y_test).items())


[(1, 161), (2, 183), (3, 185)]

In [29]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

clf = SVC(kernel='linear')  
clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[100  53   8]
 [ 86  57  40]
 [ 89  63  33]]
              precision    recall  f1-score   support

           1       0.36      0.62      0.46       161
           2       0.33      0.31      0.32       183
           3       0.41      0.18      0.25       185

    accuracy                           0.36       529
   macro avg       0.37      0.37      0.34       529
weighted avg       0.37      0.36      0.34       529



1. accuracy 就是该模型训练的准确率
2. macro avg 和 weighted avg 参数看下面的链接
<https://blog.csdn.net/lyb3b3b/article/details/84819931>
3. support表示在test集里面各类出现的次数

1. Linear核
主要用于线性可分的情形。参数少，速度快，对于一般数据，分类效果已经很理想了。
2. RBF核
主要用于线性不可分的情形。参数多，分类结果非常依赖于参数。通过训练数据的交叉验证来寻找合适的参数，验证过程比较耗时。
<https://blog.csdn.net/weixin_40835491/article/details/89301721>

### classification_report函数
> https://www.cnblogs.com/178mz/p/8558435.html

## Tree-based techniques

In [6]:
#use one hot code for the data
data_one_hot = pd.get_dummies(data)

In [7]:
#data and label
dataset = np.array(data_one_hot)
label_data = np.array(label["damage_grade"])


In [8]:
## split test and train  (set)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test  = train_test_split(dataset, label_data, test_size=0.2, random_state=50)

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

clf = RandomForestClassifier(n_estimators=1000,max_features=4)  # square root of number of features 
clf = clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[ 2173  2686   109]
 [ 1034 25059  3565]
 [  108  7582  9805]]
              precision    recall  f1-score   support

           1       0.66      0.44      0.52      4968
           2       0.71      0.84      0.77     29658
           3       0.73      0.56      0.63     17495

    accuracy                           0.71     52121
   macro avg       0.70      0.61      0.64     52121
weighted avg       0.71      0.71      0.70     52121



### tree
> https://blog.csdn.net/qq_16633405/article/details/61200502
***
* n_estimators : integer, optional (default=10)
  随机森林中树的个数，即学习器的个数。
* max_features : (default=”auto”)
  划分叶子节点，选择的最大特征数目