In [34]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [35]:
# Load the block-level dataset and clean it in one chained expression.
df = (
    pd.read_csv('ManzanaDHLDefinitivo.csv')
    # Discard every row that contains at least one missing value.
    .dropna()
    # Discard rows in which any column holds the '*' placeholder
    # (the rows are removed; the placeholder is not replaced).
    .loc[lambda frame: frame.ne('*').all(axis=1)]
    # CVEGEO and AMBITO are removed before the numeric conversion below.
    .drop(columns=['CVEGEO', 'AMBITO'])
    # Discard rows in which any remaining column holds the 'N/D' marker.
    .loc[lambda frame: frame.ne('N/D').all(axis=1)]
    # Whatever is left is converted to float.
    .astype(float)
)

# Split into model inputs and target: 'inDHL' is the label column and every
# other column is used as a feature.
# NOTE(review): the displayed frame still contains an 'Unnamed: 0' column,
# which looks like a saved row index -- confirm it is meant to be a feature.
X_train = df.drop(['inDHL'], axis=1).values
Y_train = df[['inDHL']].values
df.head()

Unnamed: 0.1,Unnamed: 0,TIPOMZA,POBTOT,POB0_14_P,P15A29A_P,P30A59A_P,P_60YMAS_P,GRAPROES,VIVTOT,TVIPAHAB,TVIPAHAB_P,VPH_C_EL_P,VPH_EXSA_P,VPH_DREN_P,PARATRAN_C,TRANSCOL_C,Lon,Lat,inDHL
2,2.0,4.0,30.0,23.33,26.67,33.33,16.67,7.35,9.0,8.0,88.89,100.0,100.0,100.0,3.0,3.0,-100.395308,25.945133,0.0
4,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,-100.390815,25.94425,0.0
5,5.0,4.0,34.0,32.35,14.71,47.06,5.88,10.78,16.0,10.0,62.5,100.0,100.0,100.0,3.0,3.0,-100.391641,25.944584,0.0
6,6.0,4.0,88.0,25.0,31.82,37.5,5.68,9.21,25.0,22.0,88.0,100.0,100.0,100.0,3.0,3.0,-100.392461,25.944786,0.0
8,8.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,-100.402177,25.945854,0.0


In [36]:
def Missing_Values(data):
    """Summarise missing and unique values for every column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with the columns ``Variable``,
        ``Total_Value`` (number of rows), ``Total_Missing_Value`` (number of
        nulls), ``Missing_Value_Rate`` (nulls / rows, rounded to 3 decimals),
        ``Data_Type``, ``Unique_Value`` (result of ``Series.unique()``) and
        ``Total_Unique_Value``. Rows are sorted by ``Missing_Value_Rate`` in
        descending order; columns with equal rates keep the order they have
        in ``data``.
    """
    summary = {
        "Variable": [],
        "Total_Value": [],
        "Total_Missing_Value": [],
        "Missing_Value_Rate": [],
        "Data_Type": [],
        "Unique_Value": [],
        "Total_Unique_Value": [],
    }
    for col in data.columns:
        series = data[col]
        n_rows = series.shape[0]
        # Compute the null count and the unique values once and reuse them.
        n_missing = series.isnull().sum()
        uniques = series.unique()
        # A frame without rows has no meaningful rate; skip the 0/0 division.
        rate = round(n_missing / n_rows, 3) if n_rows else float("nan")

        summary["Variable"].append(col)
        summary["Total_Value"].append(n_rows)
        summary["Total_Missing_Value"].append(n_missing)
        summary["Missing_Value_Rate"].append(rate)
        summary["Data_Type"].append(series.dtype)
        summary["Unique_Value"].append(uniques)
        summary["Total_Unique_Value"].append(len(uniques))

    missing_data = pd.DataFrame(summary)
    # A stable sort keeps tied columns in their original order instead of
    # shuffling them arbitrarily.
    return missing_data.sort_values(
        "Missing_Value_Rate", ascending=False, kind="mergesort"
    )

# Display the per-column missing/unique-value summary for the cleaned frame.
Missing_Values(df)


Unnamed: 0,Variable,Total_Value,Total_Missing_Value,Missing_Value_Rate,Data_Type,Unique_Value,Total_Unique_Value
0,Unnamed: 0,66982,0,0.0,float64,"[2.0, 4.0, 5.0, 6.0, 8.0, 9.0, 10.0, 11.0, 12....",66982
10,TVIPAHAB_P,66982,0,0.0,float64,"[88.89, 0.0, 62.5, 88.0, 83.33, 76.92, 63.64, ...",1745
17,Lat,66982,0,0.0,float64,"[25.94513325, 25.94425029, 25.94458396, 25.944...",66924
16,Lon,66982,0,0.0,float64,"[-100.395308, -100.390815, -100.391641, -100.3...",62854
15,TRANSCOL_C,66982,0,0.0,float64,"[3.0, 7.0, 8.0, 2.0, 1.0, 9.0]",6
14,PARATRAN_C,66982,0,0.0,float64,"[3.0, 7.0, 8.0, 2.0, 1.0, 9.0]",6
13,VPH_DREN_P,66982,0,0.0,float64,"[100.0, 0.0, 88.89, 87.5, 80.0, 75.0, 66.67, 8...",359
12,VPH_EXSA_P,66982,0,0.0,float64,"[100.0, 0.0, 88.89, 90.0, 85.71, 91.67, 87.5, ...",365
11,VPH_C_EL_P,66982,0,0.0,float64,"[100.0, 0.0, 87.5, 92.31, 80.0, 66.67, 75.0, 9...",295
9,TVIPAHAB,66982,0,0.0,float64,"[8.0, 0.0, 10.0, 22.0, 5.0, 7.0, 9.0, 20.0, 14...",236


In [37]:
# Hold out the first TEST_SIZE rows for testing and train on the remainder.
# The slices are taken from `df` rather than from X_train/Y_train, so
# re-running this cell always produces the same split (re-slicing the
# training arrays themselves would discard another 5000 rows on each re-run).
# NOTE(review): the hold-out is a positional slice, not a random sample.
TEST_SIZE = 5000
features = df.drop(['inDHL'], axis=1).values
labels = df[['inDHL']].values
X_test, Y_test = features[:TEST_SIZE], labels[:TEST_SIZE]
X_train, Y_train = features[TEST_SIZE:], labels[TEST_SIZE:]

In [38]:
# # standardize the data
# Y_trains = (Y_train - Y_train.mean()) / Y_train.std()
# X_trains = (X_train - X_train.mean()) / X_train.std()
# Y_tests = (Y_test - Y_test.mean()) / Y_test.std()
# X_tests = (X_test - X_test.mean()) / X_test.std()

In [39]:
# Inspect the first ten held-out labels.
Y_test[:10]

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.]])

In [44]:
import tensorflow as tf 
import keras 

# Feed-forward binary classifier: two 64-unit ReLU hidden layers followed by
# a single sigmoid output unit. The input width is taken from the training
# matrix so it follows whatever feature set was prepared above.
n_features = X_train.shape[1]

model = keras.Sequential()
model.add(keras.layers.Dense(64, activation=tf.nn.relu, input_shape=(n_features,)))
model.add(keras.layers.Dense(64, activation=tf.nn.relu))
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))

model.compile(
    optimizer=tf.optimizers.Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 5 epochs with mini-batches of 32 samples. The call is left as the
# last expression so the returned History object is shown as the cell output.
model.fit(X_train, Y_train, batch_size=32, epochs=5)






Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x28329228b20>

In [45]:
#show the loss and accuracy of the model
# print('Test accuracy:', test_acc)




In [60]:
# Build a frame with the test features, the true label and the thresholded
# model output so they can be compared row by row.
# Feature names are obtained by dropping the target column by name, which
# does not rely on 'inDHL' being the last column of `df`.
feature_names = df.columns.drop('inDHL')
df_pred = pd.DataFrame(X_test, columns=feature_names)
# Y_test and the model output are column vectors; ravel() gives the 1-D
# shape expected for a DataFrame column.
df_pred['Actual'] = Y_test.ravel()
# Turn the sigmoid output into a hard label: 1.0 above 0.5, otherwise 0.0.
df_pred['Prediction'] = (model.predict(X_test) > 0.5).astype("float32").ravel()

df_pred



Unnamed: 0.1,Unnamed: 0,TIPOMZA,POBTOT,POB0_14_P,P15A29A_P,P30A59A_P,P_60YMAS_P,GRAPROES,VIVTOT,TVIPAHAB,TVIPAHAB_P,VPH_C_EL_P,VPH_EXSA_P,VPH_DREN_P,PARATRAN_C,TRANSCOL_C,Lon,Lat,Actual,Prediction
0,2.0,4.0,30.0,23.33,26.67,33.33,16.67,7.35,9.0,8.0,88.89,100.0,100.0,100.0,3.0,3.0,-100.395308,25.945133,0.0,0.0
1,4.0,4.0,0.0,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.0,0.0,0.0,3.0,3.0,-100.390815,25.944250,0.0,1.0
2,5.0,4.0,34.0,32.35,14.71,47.06,5.88,10.78,16.0,10.0,62.50,100.0,100.0,100.0,3.0,3.0,-100.391641,25.944584,0.0,1.0
3,6.0,4.0,88.0,25.00,31.82,37.50,5.68,9.21,25.0,22.0,88.00,100.0,100.0,100.0,3.0,3.0,-100.392461,25.944786,0.0,0.0
4,8.0,4.0,0.0,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.00,0.0,0.0,0.0,3.0,3.0,-100.402177,25.945854,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,5800.0,4.0,56.0,23.21,23.21,48.21,5.36,11.79,16.0,15.0,93.75,100.0,100.0,100.0,3.0,3.0,-100.211765,25.795830,0.0,0.0
4996,5801.0,4.0,502.0,28.49,25.50,42.43,3.59,11.70,151.0,135.0,89.40,100.0,100.0,100.0,3.0,3.0,-100.214848,25.796563,0.0,0.0
4997,5802.0,4.0,149.0,26.17,25.50,42.28,6.04,12.62,52.0,41.0,78.85,100.0,100.0,100.0,3.0,3.0,-100.217893,25.800406,0.0,0.0
4998,5803.0,4.0,72.0,31.94,23.61,44.44,0.00,13.16,19.0,19.0,100.00,100.0,100.0,100.0,3.0,3.0,-100.228988,25.809101,0.0,0.0


In [64]:
# # Save the model
# model.save('DhlporManzana.h5')

In [66]:
# Using df_pred, plot only the predictions that differ from the actual value
import folium
import numpy as np
from geopy.distance import geodesic

# Rows of the test set where the model disagrees with the true label.
misclassified = df_pred[df_pred['Actual'] != df_pred['Prediction']]
points = misclassified[['Lat', 'Lon']].values.tolist()
# Keep each prediction paired with its own row. Indexing
# df_pred['Prediction'] by the position inside `points` would show the
# prediction of an unrelated row in the popup.
point_predictions = misclassified['Prediction'].tolist()

# Create the map around a fixed centre and add one marker per
# misclassified block, labelled with the value the model predicted.
m = folium.Map(location=[25.656114182076717, -100.3065789670519], zoom_start=12)
for location, predicted in zip(points, point_predictions):
    folium.Marker(location, popup=str(predicted)).add_to(m)

# Overlay the real DHL locations in green so they can be told apart from the
# misclassified blocks.
dfdhl = pd.read_csv('Database/DHL_NuevoLeon.csv')
realDhlpoints = dfdhl[['latitud', 'longitud']].values.tolist()
for location in realDhlpoints:
    folium.Marker(location, popup="DHL", icon=folium.Icon(color='green')).add_to(m)

# Last expression of the cell: renders the map.
m


# Create a list of the predictions



In [71]:
# logistic.fit(X_train, y_train)
# y_pred = logistic.predict(X_test)
# print("Matriz de Confusión para ", name, confusion_matrix(y_test,y_pred))
# print("Reporte de Clasificación para ", name, classification_report(y_test,y_pred))

# Redo the previous code, but with our data


from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


# Compare the held-out labels with the thresholded predictions and print the
# confusion matrix followed by the per-class precision/recall report.
model_label = 'Binary Cross Entropy'
predicted_labels = df_pred['Prediction']
print("Matriz de Confusión para ", model_label, confusion_matrix(Y_test, predicted_labels))
print("Reporte de Clasificación para ", model_label, classification_report(Y_test, predicted_labels))




Matriz de Confusión para  Binary Cross Entropy [[3486 1278]
 [  59  177]]
Reporte de Clasificación para  Binary Cross Entropy               precision    recall  f1-score   support

         0.0       0.98      0.73      0.84      4764
         1.0       0.12      0.75      0.21       236

    accuracy                           0.73      5000
   macro avg       0.55      0.74      0.52      5000
weighted avg       0.94      0.73      0.81      5000



In [72]:
# Print the layer-by-layer architecture and parameter counts of the model.
model.summary()


Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_42 (Dense)            (None, 64)                1216      
                                                                 
 dense_43 (Dense)            (None, 64)                4160      
                                                                 
 dense_44 (Dense)            (None, 1)                 65        
                                                                 
Total params: 5,441
Trainable params: 5,441
Non-trainable params: 0
_________________________________________________________________
