In [93]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [59]:
# Load the remediation-sites CSV; the path is relative to the notebook's working directory.
data=pd.read_csv('../input/nys-environmental-remediation-sites/environmental-remediation-sites.csv')

In [60]:
# Print pandas' summary of the loaded frame.
data.info()

In [61]:
# Count the missing values in each column.
data.isna().sum()

In [62]:
# Drop every column in which more than 25% of the rows are missing.
# Only the column labels are kept in `null_columns`: passing a DataFrame slice
# to `drop` works solely because iterating a frame yields its labels, and it
# copies the column data for no reason.
missing_counts=data.isna().sum()
null_columns=missing_counts[missing_counts>0.25*data.shape[0]].index
data=data.drop(columns=null_columns)

In [63]:
# Preview the first rows only instead of dumping the whole frame.
data.head()


In [64]:
# Re-check the per-column missing-value counts on the reduced frame.
data.isna().sum()

In [65]:
# Columns removed before modelling, listed one per line for easier review.
unneeded_columns=[
    'Program Number',
    'Project Name',
    'Program Facility Name',
    'Address1',
    'Locality',
    'ZIPCode',
    'SWIS Code',
    'Owner Name',
    'Owner Address1',
    'Owner City',
    'Owner State',
    'Owner ZIP',
    'Georeference',
    'Contaminants',
]

data=data.drop(columns=unneeded_columns)

In [66]:
# Preview the first rows only instead of dumping the whole frame.
data.head()

In [67]:
def get_uniques(df,columns):
    """Map each requested column name to the list of distinct values it holds.

    Args:
        df: DataFrame to inspect.
        columns: Iterable of column labels present in ``df``.

    Returns:
        dict of ``{column label: list of unique values}``, with the values in
        the order ``Series.unique()`` reports them.
    """
    uniques={}
    for name in columns:
        uniques[name]=list(df[name].unique())
    return uniques

def get_categorical_columns(df):
    """Return the labels of the text-like (object or string dtype) columns of ``df``.

    The dtypes are walked as (label, dtype) pairs rather than looked up by
    label, so frames with duplicated column labels no longer raise an
    ambiguous-truth-value error. Columns stored with a pandas string dtype are
    reported as well as plain ``object`` columns.

    Args:
        df: DataFrame to inspect.

    Returns:
        list of column labels, in column order (duplicates preserved).
    """
    return [
        column
        for column,dtype in df.dtypes.items()
        if dtype=='object' or pd.api.types.is_string_dtype(dtype)
    ]

In [68]:
# Show the distinct values of every column reported by get_categorical_columns.
get_uniques(data,get_categorical_columns(data))

In [69]:
# Split 'Project Completion Date' into numeric Year and Month features.
# The text is sliced positionally: characters 0-3 are read as the year and
# characters 5-6 as the month; missing dates stay NaN.
# `np.float` was removed in NumPy 1.24, so the conversion uses the builtin
# float dtype through vectorised `.str` slicing instead of a row-wise apply.
completion_date=data['Project Completion Date'].str[0:7]

data['Year']=completion_date.str[0:4].astype(float)
data['Month']=completion_date.str[5:7].astype(float)
data=data.drop('Project Completion Date',axis=1)

In [70]:
# A cell renders only its last expression, so the null counts and the dtypes
# are combined into one table instead of silently discarding the first result.
pd.DataFrame({'missing':data.isna().sum(),'dtype':data.dtypes})

In [71]:
# Replace the missing entries of each listed numeric column with that column's mean.
cols=['New York Zip Codes 2','Counties 2','Year','Month']

data=data.fillna({column:data[column].mean() for column in cols})
    


In [72]:
# Re-check the per-column missing-value counts after filling with column means.
data.isna().sum()

In [73]:
def onehot_encode(df,column,prefix=None):
    """Return a copy of ``df`` with ``column`` replaced by its one-hot indicator columns.

    Args:
        df: Source DataFrame; it is not modified.
        column: Label of the column to encode.
        prefix: Optional text forwarded to ``pd.get_dummies``. When given,
            indicator columns are named ``"<prefix>_<value>"``, which keeps
            them distinct when several encoded columns share a value. The
            default ``None`` keeps the bare values as column names.

    Returns:
        New DataFrame holding the remaining original columns followed by the
        indicator columns.
    """
    dummies=pd.get_dummies(df[column],prefix=prefix)
    # concat builds a new frame, so the caller's frame is left untouched.
    return pd.concat([df,dummies],axis=1).drop(column,axis=1)

In [74]:
# One-hot encode every column reported by get_categorical_columns except
# 'Program Type', which is taken out of the list before the loop.
nominal_features=get_categorical_columns(data)
nominal_features.remove('Program Type')

for feature in nominal_features:
    data=onehot_encode(df=data,column=feature)

In [78]:
# Preview the first rows only instead of dumping the whole frame.
data.head()

In [79]:
# Count how many columns still have the object dtype.
(data.dtypes=='object').sum()

In [80]:
# Replace the 'Program Type' labels with the integer codes produced by a
# LabelEncoder; the fitted encoder is kept in `label_encoder`.
label_encoder=LabelEncoder()
label_encoder.fit(data['Program Type'])

data['Program Type']=label_encoder.transform(data['Program Type'])

In [81]:
# Separate the feature columns (x) from the encoded 'Program Type' column (y).
x=data.drop(columns='Program Type')
y=data['Program Type']

In [82]:
# Standardise every feature column, keeping the original index and column labels.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# is made further down, so test rows influence the scaling statistics;
# consider fitting on the training rows only.
scaler=StandardScaler()
scaled_values=scaler.fit_transform(x)

x=pd.DataFrame(scaled_values,index=x.index,columns=x.columns)

In [83]:
# Hold out 30% of the rows for testing. A fixed random_state makes the split
# identical on every run, so results can be reproduced and compared.
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,random_state=42)

In [87]:
# (rows, columns) of the feature frame; the second entry is the number of input features.
x.shape

In [88]:
# Number of rows for each encoded 'Program Type' value.
y.value_counts()

In [96]:
# Build and train a small fully connected classifier.
# The input width and the number of output units are derived from the data
# rather than hard-coded, so the cell keeps working if preprocessing changes
# the number of feature columns or target classes.
n_features=x_train.shape[1]
n_classes=len(label_encoder.classes_)

# The intermediate tensors use their own name: rebinding `x` here would
# overwrite the feature frame and break any earlier cell that is re-run.
inputs=tf.keras.Input(shape=(n_features,))
hidden=tf.keras.layers.Dense(64,activation='relu')(inputs)
hidden=tf.keras.layers.Dense(54,activation='relu')(hidden)
outputs=tf.keras.layers.Dense(n_classes,activation='softmax')(hidden)

model=tf.keras.Model(inputs=inputs,outputs=outputs)

# Sparse categorical cross-entropy matches the integer-encoded target.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)


batch_size=64
epochs=55

# 20% of the training rows are held back for validation during fitting.
history=model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[tf.keras.callbacks.ReduceLROnPlateau()],
    verbose=0
)

In [97]:
# Plot the training and validation loss recorded for each epoch.
fig=px.line(
    data_frame=history.history,
    y=['loss','val_loss'],
    labels={'index':"Epochs",'value':"Loss"},
    title="Training and Validation Loss",
)

fig.show()

In [98]:
# Zero-based position of the epoch with the lowest validation loss.
np.argmin(history.history['val_loss'])

In [99]:
# Evaluate the trained model on the held-out test rows.
model.evaluate(x_test,y_test)