In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the training CSV and preview its first rows.
train_data = pd.read_csv("../input/tabular-playground-series-may-2021/train.csv")
train_data.head()

In [None]:
# Load the test CSV and preview its first rows.
test_data = pd.read_csv("../input/tabular-playground-series-may-2021/test.csv")
test_data.head()

In [None]:
# The target classes are not balanced: show each class's share as a pie chart
# and its raw row count as a bar chart.
target_mass = train_data['target'].value_counts()
values = target_mass.values.tolist()
indexes = target_mass.index.tolist()

# plt.subplots returns (figure, axes) in that order; draw on each Axes
# explicitly instead of going through the pyplot "current axes" state.
fig, axes = plt.subplots(1, 2, figsize=(12, 3))
axes[0].pie(values, labels=indexes, shadow=True, autopct='%.2f%%',
            colors=['b','r','g','y','c','m','y','k','c','g','g'])
axes[0].set_title('Target class share')
axes[1].bar(indexes, values, color="green")
axes[1].set_title('Target class counts')
plt.show()

In [None]:
# Remove the 'id' column from both frames before scaling and modelling.
# The axis is named by keyword because pandas 2.0 no longer accepts the
# positional form drop('id', 1).
Train = train_data.drop(columns='id')
Test = test_data.drop(columns='id')

In [None]:
# Min-max normalisation of the feature columns
def minmaxscaler(data1, data2):
    """Min-max scale the shared columns of two frames using ``data1``'s range.

    For every column of ``data2`` whose dtype in ``data1`` is not ``object``,
    both frames are transformed with ``(x - min) / (max - min)``, where
    ``min`` and ``max`` are taken from ``data1`` only. Values of ``data2``
    outside that range therefore fall outside ``[0, 1]``.

    Parameters
    ----------
    data1 : pandas.DataFrame
        Reference frame; supplies the per-column minimum and maximum.
    data2 : pandas.DataFrame
        Second frame; its column list decides which columns are scaled.

    Returns
    -------
    tuple of pandas.DataFrame
        Scaled copies ``(data1, data2)``. The input frames are not modified.
    """
    scaled1 = data1.copy()
    scaled2 = data2.copy()

    for feature in data2.columns:
        if data1[feature].dtype != 'object':
            min_value = data1[feature].min()
            span = data1[feature].max() - min_value
            # A constant column has zero range; divide by 1 so it maps to 0
            # instead of producing NaN/inf from a division by zero.
            if span == 0:
                span = 1
            scaled1[feature] = (data1[feature] - min_value) / span
            scaled2[feature] = (data2[feature] - min_value) / span

    return scaled1, scaled2

In [None]:
# Scale both frames with the per-column min/max taken from the training frame.
Train,Test = minmaxscaler(Train,Test)

In [None]:
# Preview the first rows of the scaled training frame.
Train.head()

In [None]:
# Preview the last rows of the scaled training frame.
Train.tail()

In [None]:
from scipy.stats import variation as var

# Print scipy's `variation` statistic for every feature column of the training
# frame and accumulate the total. The accumulator is not called `sum`, so the
# builtin stays usable in later cells. Test is built without a target column,
# so all of its columns are iterated (a trailing [:-1] would skip the last
# feature).
total_variation = 0.0
for col in Test.columns:
    col_variation = var(Train[col])  # computed once, reused for sum and print
    total_variation += col_variation
    print(col,' : ',col_variation)


In [None]:
# Helper that discards features whose variation is below a threshold
def drop_low_var_values(data,threshold):
    """Return ``data`` restricted to the columns worth keeping.

    A column is kept when its dtype is ``object``, or when ``var(column)``
    is greater than or equal to ``threshold``. Column order is preserved and
    the shape of the result is printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter.
    threshold : float
        Minimum ``var`` value a non-object column needs to be kept.

    Returns
    -------
    pandas.DataFrame
        ``data`` indexed by the list of kept column labels.
    """
    def _is_kept(name):
        column = data[name]
        # Object columns bypass the numeric check entirely.
        return column.dtype == 'object' or var(column) >= threshold

    kept = [name for name in data.columns if _is_kept(name)]
    filtered = data[kept]
    print("newdata shape is", filtered.shape)
    return filtered

In [None]:
# Keep only the training columns that pass the 1.2 threshold in drop_low_var_values.
Train_data = drop_low_var_values(Train,1.2)
# Select the same columns from the test frame; [:-1] skips the last kept column,
# presumably 'target' (absent from Test) — TODO confirm it is always last.
Test_data = Test[Train_data.columns[:-1]]

In [None]:
from sklearn.model_selection import train_test_split


def _encode_labels(labels):
    """Turn labels such as 'Class_3' into zero-based integer ids (here 2)."""
    # Parse everything after the last underscore so that multi-digit class
    # numbers are handled, not just the final character.
    return np.array([int(str(label).rsplit('_', 1)[-1]) - 1 for label in labels])


def split_data(test_size,data,random_state=1):
    """Shuffle ``data`` and split it into stratified train/validation arrays.

    Parameters
    ----------
    test_size : float or int
        Passed to ``train_test_split`` as the size of the validation part.
    data : pandas.DataFrame
        Frame containing the feature columns plus a ``'target'`` column whose
        values end in the 1-based class number (e.g. ``'Class_1'``).
    random_state : int, optional
        Seed used for both the initial shuffle and the split, so that repeated
        runs produce the same partition. Defaults to 1.

    Returns
    -------
    tuple
        ``(X_train, X_val, y_train, y_val)`` as NumPy arrays; the ``y`` arrays
        hold zero-based integer class ids.
    """
    data = data.sample(frac=1, random_state=random_state)
    # Keyword form: pandas 2.0 removed the positional axis of drop('target', 1).
    features = data.drop(columns='target').to_numpy()
    targets = data['target'].to_numpy()
    X_train, X_val, targets_train, targets_val = train_test_split(
        features,
        targets,
        test_size=test_size,
        random_state=random_state,
        stratify=targets,
    )
    return X_train, X_val, _encode_labels(targets_train), _encode_labels(targets_val)


In [None]:
# Split the filtered training data, passing 0.2 as the validation test_size.
X_train , X_val , y_train , y_val = split_data(0.2,Train_data)

In [None]:
# Dimensions of the training feature array.
X_train.shape

In [None]:
# Dimensions of the validation feature array.
X_val.shape

In [None]:
# Dimensions of the training label array.
y_train.shape

In [None]:
# Dimensions of the validation label array.
y_val.shape

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.optimizers import Adam
from keras.layers import BatchNormalization,Dense,Dropout

In [None]:
# Shape of a single training sample; the model below uses it as input_shape.
X_train[0].shape

In [None]:
num_classes = 4
# Reset Keras' global state before building the model.
keras.backend.clear_session()

# Fully connected classifier: ReLU dense layers, with a Dropout +
# BatchNormalization pair after the third and sixth dense layers, ending in a
# softmax over num_classes outputs.
model = Sequential([
    Dense(32, activation='relu', input_shape=X_train[0].shape),
    Dense(64, activation='relu'),
    Dense(64, activation='relu'),
    Dropout(0.2),
    BatchNormalization(),
    Dense(64, activation='relu'),
    Dense(64, activation='relu'),
    Dense(128, activation='relu'),
    Dropout(0.2),
    BatchNormalization(),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax'),
])

In [None]:
# Print the summary of the network defined above.
model.summary()

In [None]:
# Labels are integer class ids, hence the sparse categorical cross-entropy;
# Adam is used with its default settings and accuracy is tracked.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(),
    metrics=['accuracy'],
)

In [None]:
# Training configuration: number of passes and the hold-out set evaluated
# after every epoch.
epochs = 5
validation_data = (X_val,y_val)

# Fit with mini-batches of 10 samples and keep the per-epoch metrics.
history = model.fit(
    X_train,
    y_train,
    batch_size=10,
    epochs=epochs,
    validation_data=validation_data,
    verbose=1,
)

In [None]:
# Training curves: loss on the left panel, accuracy on the right, each with
# the train and validation series recorded by model.fit.
# plt.subplots returns (figure, axes) in that order; draw on each Axes
# explicitly instead of going through the pyplot "current axes" state.
fig, axes = plt.subplots(1, 2, figsize=(20, 6))
fig.suptitle('Model Performance', size=25)

for axis, metric, title in zip(axes, ('loss', 'accuracy'), ('Loss', 'Accuracy')):
    axis.plot(history.history[metric], label='train')
    axis.plot(history.history['val_' + metric], label='validation')
    axis.set_xlabel('epoch')
    axis.set_title(title)
    axis.legend()

plt.show()

In [None]:
# Build the submission: the test ids followed by one predicted-probability
# column per class, written to ann2.csv without the index.
labels = ['Class_1', 'Class_2', 'Class_3', 'Class_4']
ts_id = test_data['id']
class_probabilities = pd.DataFrame(model.predict(Test_data), columns=labels)
pred = pd.concat([ts_id, class_probabilities], axis=1)
pred.to_csv('ann2.csv', index=False)