In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from scipy.stats import randint
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA

In [2]:
def DealMissingData(X, dfX):
    ''' Fill missing values column by column.

        Text columns are filled with their most frequent value, every other
        column with its mean. X is modified in place and is also returned.

        X: data, type(numpy array), indexed as X[row, column]
        dfX: same as X data, type(pandas array); only used to find out
             which columns contain missing values
        return: X with the missing entries replaced '''
    
    print("----------- Start deal missing data -----------")
    TFarr = np.array(dfX.isna().any())
    for index, ele in enumerate(TFarr):
        if not ele:
            print(f"column {index} not have missing data")
            continue

        # Decide text vs. numeric from the first value that is actually
        # present. Looking only at row 0 is wrong when row 0 is itself
        # missing: NaN is a float, so a text column would get 'mean'.
        column = X[:, index]
        present = column[~pd.isna(column)]
        is_text = present.size > 0 and isinstance(present[0], str)
        strategy = 'most_frequent' if is_text else 'mean'

        imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
        X[:, [index]] = imputer.fit_transform(X[:, [index]])
        print(f"column {index} have missing data, fixed!")
            
    print("----------- End deal missing data! -----------")
    return X

In [3]:
def MakeOneHot(X, pos=None, extra_cols=(3,)):
    ''' One-hot encode selected columns of X.

        Each encoded column is replaced by its dummy columns minus the first
        one; the dummies are placed at the front of the array and the
        remaining columns follow in their original order.

        X: data, type(numpy array)
        pos: where need to onehot, type(dictionary) {name: position counted
             from the END of the current array}; columns are handled in
             dictionary order. When empty/None the automatic mode is used.
        extra_cols: automatic mode only - positions that are encoded even
             when they do not hold strings. A position is compared with the
             CURRENT index, i.e. after earlier columns have been expanded.
             The default (3,) keeps the historical behaviour; pass () to
             encode string columns only.
        return: the encoded array (always dense) '''
    
    print("----------- Start onehot -----------")
    if len(X) == 0:
        # No rows: nothing to inspect (X[0] would fail) and nothing to encode.
        print("----------- End onehot -----------")
        return X

    extra_cols = () if extra_cols is None else tuple(extra_cols)
    FeaturesNum = len(X[0])    # init
    if pos:
        # custom onehot (onehot pos that u want to)
        for key in pos:
            target = FeaturesNum - pos[key]
            print(f"column {target} need to one-hot, fixed!")
            X = _OneHotColumn(X, key, target)
            FeaturesNum = len(X[0])
    else:
        # auto onehot (string cols, plus the positions listed in extra_cols)
        i = 0
        while i < FeaturesNum:
            if isinstance(X[0][i], str) or i in extra_cols:
                print(f"column {i}({X[0][i]}) need to one-hot, fixed!")
                NewX = _OneHotColumn(X, str(i), i)
                # Skip over the columns that were just inserted so that i
                # lands on the column that originally followed this one.
                i += len(NewX[0]) - len(X[0])
                X = NewX
                FeaturesNum = len(X[0])
            i += 1
    print("----------- End onehot -----------")
    return X


def _OneHotColumn(X, name, column):
    ''' One-hot encode a single column and drop its first dummy column. '''
    # sparse_threshold=0 forces a dense result; a sparse matrix would break
    # the len(X[0]) / X[0][i] access used by MakeOneHot.
    ct = ColumnTransformer([(name, OneHotEncoder(), [column])],
                           remainder='passthrough', sparse_threshold=0)
    return ct.fit_transform(X)[:, 1:]


In [4]:
def NormalizeData(data_train, data_test):
    ''' Scale both splits with one StandardScaler.

        The scaler is fitted on the training data only and the fitted
        scaler is then applied to the testing data.

        data_train: training data, type(numpy array)
        data_test: testing data, type(numpy array)
        return: (scaled training data, scaled testing data) '''
    
    print("----------- Start normalize -----------")
    scaler = StandardScaler()
    # tuple elements are evaluated left to right: fit on train, then apply to test
    scaled = (scaler.fit_transform(data_train), scaler.transform(data_test))
    
    print("----------- End normalize -----------")
    return scaled

In [5]:
def DrawRelationship(X, y, label_x, label_y):
    ''' Show one red scatter plot per feature: feature values against y.

        X: data, indexed as X[row][column]
        y: values plotted on the vertical axis
        label_x: {feature name: column index in X}, type(dictionary)
        label_y: label of the vertical axis '''
    
    row_count = len(X)
    for name, column in label_x.items():
        # collect this feature's value from every row
        values = [X[row][column] for row in range(row_count)]

        plt.scatter(values, y, c="red")
        plt.xlabel(name)
        plt.ylabel(label_y)
        plt.show()

In [6]:
def DrawPredict(gt, pred, title, xlabel="dataNums", ylabel="charges"):
    ''' Plot predictions (blue) and ground truth (red) against sample rank.

        Samples are ordered by ascending ground truth (stable order for
        ties) and plotted at x = 0, 1, 2, ...

        gt: ground-truth values, indexable with len()
        pred: predicted values, at least as long as gt
        title: figure title
        xlabel: label of the horizontal axis
        ylabel: label of the vertical axis (the default is kept for
                backward compatibility) '''
    
    # data [[groundTruth, predict], [], ....] sorted by groundTruth
    pairs = sorted(((gt[i], pred[i]) for i in range(len(gt))),
                   key=lambda pair: pair[0])
    positions = list(range(len(pairs)))

    # Two vectorised calls instead of two calls per sample: the per-point
    # version creates one artist per point and is extremely slow on large sets.
    plt.scatter(positions, [pair[1] for pair in pairs], c="blue", s=0.7)
    plt.scatter(positions, [pair[0] for pair in pairs], c="red", s=0.7)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()

In [7]:
def TrainAndTestModel(model, X_train, y_train, X_test, y_test, title):
    ''' Fit a model, then print confusion matrices and scores for both splits.

        model: estimator exposing fit / predict / score
        X_train, y_train: training features and labels
        X_test, y_test: testing features and labels
        title: name of the run; not used by the current implementation
        return: the fitted model (same object that was passed in) '''
    
    def _evaluate(features, labels):
        # predictions first, then the model's own score on the same split
        predicted = model.predict(features)
        return predicted, model.score(features, labels)

    print("Start fit data")
    model.fit(X_train, y_train)
    print("fit complet")
    
    train_pred, train_acc = _evaluate(X_train, y_train)
    test_pred, test_acc = _evaluate(X_test, y_test)
    
    # both matrices are built before anything is reported
    train_cm, test_cm = (confusion_matrix(y_train, train_pred),
                         confusion_matrix(y_test, test_pred))

    print(f"Train_cm:\n{train_cm}")
    print(f"Test_cm:\n{test_cm}")
    print(f"Train acc: {train_acc}")
    print(f"Test acc: {test_acc}")
    
    return model

In [8]:
# Load the raw dataset from a path relative to the notebook.
df = pd.read_csv("./DATA/final_project_dataset_2.csv")

# Features: every column except the first and the last. Target: the last column.
# Both are kept as a DataFrame (dfX / dfy) and as a numpy array (X / y).
dfX = df.iloc[:, 1:-1]
X = dfX.values
dfy = df.iloc[:, [-1]]
y = dfy.values
# The bare string below is never assigned: it is a disabled
# {column name: index} map meant for DrawRelationship.
'''
label_x = {'Location': 0, 'MinTemp': 1, 'MaxTemp': 2,
           'Rainfall': 3, 'Evaporation': 4, 'Sunshine': 5,
           'WindGustDir': 6, 'WindGustSpeed': 7, 'WindDir9am': 8,
           'WindDir3pm': 9, 'WindSpeed9am': 10, 'WindSpeed3pm': 11,
           'Humidity9am': 12, 'Humidity3pm': 13, 'Pressure9am': 14,
           'Pressure3pm': 15, 'Cloud9am': 16, 'Cloud3pm': 17,
           'Temp9am': 18, 'Temp3pm': 19, 'RainToday': 20
          }
'''
#DrawRelationship(X, y, label_x=label_x, label_y='RainTomorrow')

# Fill missing values in the features and in the target.
# NOTE(review): imputation and one-hot encoding run on the full data set,
# before the train/test split below, so test rows take part in the fitted
# statistics/categories - confirm this is acceptable for the reported scores.
X = DealMissingData(X, dfX)
y = DealMissingData(y, dfy)

# First row after imputation, before encoding.
print(X[0])
print(y[0])
#pos2onehot = {'Sex': 5, 'children': 3, 'Smoker': 2, 'Region': 1}
# No `pos` is given, so MakeOneHot runs in its automatic mode.
# The target goes through the same routine and is then flattened to 1-D.
X = MakeOneHot(X)
y = MakeOneHot(y)
y = y.reshape(1, -1)[0]

# split data: 20% held out for testing, fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Scaling is applied to the two splits only.
X_train, X_test = NormalizeData(X_train, X_test)
# X and y are the un-split arrays; X is encoded but NOT scaled.
print(X[0])
print(y[0])
print("Preprocessing data done!")

----------- Start deal missing data -----------
column 0 not have missing data
column 1 have missing data, fixed!
column 2 have missing data, fixed!
column 3 have missing data, fixed!
column 4 have missing data, fixed!
column 5 have missing data, fixed!
column 6 have missing data, fixed!
column 7 have missing data, fixed!
column 8 have missing data, fixed!
column 9 have missing data, fixed!
column 10 have missing data, fixed!
column 11 have missing data, fixed!
column 12 have missing data, fixed!
column 13 have missing data, fixed!
column 14 have missing data, fixed!
column 15 have missing data, fixed!
column 16 have missing data, fixed!
column 17 have missing data, fixed!
column 18 have missing data, fixed!
column 19 have missing data, fixed!
column 20 have missing data, fixed!
----------- End deal missing data! -----------
----------- Start deal missing data -----------
column 0 not have missing data
----------- End deal missing data! -----------
['Albury' 13.4 22.9 0.6 5.46982421634

In [9]:
# Reduce the feature matrix to 50 principal components. PCA is fitted on
# the training split and the fitted projection is applied to the test split.
# NOTE(review): X_train / X_test are overwritten, so re-running this cell on
# its own applies PCA to already-projected data.
pca = PCA(n_components=50)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
# Share of the total variance carried by each retained component.
explained_variance = pca.explained_variance_ratio_
print(explained_variance)

[0.0516031  0.03238822 0.02460146 0.02268849 0.01912658 0.01843925
 0.01641429 0.01622303 0.01473322 0.01395672 0.01322049 0.0127462
 0.0122484  0.01194041 0.01174589 0.01125593 0.01115764 0.01104661
 0.01090434 0.01084147 0.01055176 0.01044841 0.01034639 0.01022954
 0.01016913 0.01004155 0.00998548 0.009939   0.00988138 0.0097174
 0.009696   0.00967766 0.00956333 0.00952121 0.00950352 0.00940687
 0.00939783 0.00932326 0.00931201 0.00926655 0.00923699 0.00922514
 0.0091603  0.00911909 0.00908789 0.0090626  0.00899397 0.00897306
 0.00891726 0.00888736]


In [9]:
# Print the current wall-clock time; presumably a manual timing marker
# for the training cell that follows.
from datetime import datetime
now = datetime.now()
print("now =", now)

now = 2020-12-31 16:12:13.564799


In [10]:
# Logistic regression on the prepared splits, with the estimator's verbose
# output switched on. The SVC run is kept disabled.
#TrainAndTestModel(SVC(), X_train, y_train, X_test, y_test, "SVC")
TrainAndTestModel(LogisticRegression(verbose=True), X_train, y_train, X_test, y_test, "Logistic")

Start fit data


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


fit complet
Train_cm:
[[83439  4810]
 [12481 13024]]
Test_cm:
[[20887  1180]
 [ 3102  3270]]
Train acc: 0.847996553967333
Test acc: 0.8494321178663103


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s finished


LogisticRegression(verbose=True)

In [11]:
# Second wall-clock timestamp; relies on `datetime` imported in an earlier cell.
now = datetime.now()
print("now =", now)

now = 2020-12-31 16:12:14.888236


In [14]:
# Decision tree with default settings on the same splits.
TrainAndTestModel(DecisionTreeClassifier(), X_train, y_train, X_test, y_test, "DecisionTree")

Start fit data
fit complet
Train_cm:
[[88248     1]
 [    3 25502]]
Test_cm:
[[19119  2948]
 [ 2891  3481]]
Train acc: 0.9999648364013574
Test acc: 0.79468335736137


DecisionTreeClassifier()

In [15]:
# Random forest with default settings on the same splits.
TrainAndTestModel(RandomForestClassifier(), X_train, y_train, X_test, y_test, "RandomForest")

Start fit data
fit complet
Train_cm:
[[88248     1]
 [    4 25501]]
Test_cm:
[[21219   848]
 [ 3160  3212]]
Train acc: 0.9999560455016966
Test acc: 0.8590667744998066


RandomForestClassifier()