In [1]:
# import dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [2]:
# read in file, view
df = pd.read_csv('ML_data_set.csv')
df.head()


Unnamed: 0,Zip_Code,Total_Population,Median_Income,num_of_fast_food,num_of_mex_restaurants,Labels,Brand,Address,City,State,Probability
0,601.0,17599.0,11757.0,,,0.0,,0,0,0,
1,602.0,39209.0,16190.0,,,0.0,,0,0,0,
2,603.0,50135.0,16645.0,,,0.0,,0,0,0,
3,606.0,6304.0,13387.0,,,0.0,,0,0,0,
4,610.0,27590.0,18741.0,,,0.0,,0,0,0,


In [3]:
# drop zip code
#df2 = df.drop("Zip_Code", axis=1)
# drop non-features
df2 = df.drop(["Brand","Address","City","State","Probability"], axis=1)
# remove negative numbers in Median Income
df2 = df2[(df2['Median_Income'] >= 0) | (df2['Median_Income'].isnull())]
# drop NaN rows
df2 = df2.dropna()

df2.describe()

Unnamed: 0,Zip_Code,Total_Population,Median_Income,num_of_fast_food,num_of_mex_restaurants,Labels
count,20206.0,20206.0,20206.0,20206.0,20206.0,20206.0
mean,39844.418341,9772.492379,55804.923538,156.32035,98.818915,0.059982
std,25977.968287,14179.396001,23356.426111,253.499781,192.223122,0.23746
min,1001.0,25.0,2499.0,0.0,0.0,0.0
25%,18058.25,897.0,41199.0,24.0,15.0,0.0
50%,35033.5,3120.0,50938.0,69.0,40.0,0.0
75%,59005.75,13040.25,63928.5,184.0,104.0,0.0
max,99922.0,114647.0,250001.0,6200.0,6100.0,1.0


In [4]:
# Create target vector
y = df2.Labels
# Create binary target vector indicating if class 0
y = np.where((y == 0), 0, 1)

In [5]:
# Indicies of each class' observations
i_class0 = np.where(y == 0)[0]
i_class1 = np.where(y == 1)[0]

# Number of observations in each class
n_class0 = len(i_class0)
n_class1 = len(i_class1)

# For every observation of class 1, randomly sample from class 0 without replacement
i_class0_downsampled = np.random.choice(i_class0, size=n_class1, replace=False)

# Join together class 1's target vector with the downsampled class 0's target vector
NewVector = np.hstack((y[i_class1], y[i_class0_downsampled]))

In [6]:
# Add classifier to each new vector
vector1 = pd.DataFrame(i_class1)
vector1 = vector1.rename(columns={ vector1.columns[0]: "Index" })
vector1['class'] = 1

vector0 = pd.DataFrame(i_class0_downsampled)
vector0 = vector0.rename(columns={ vector0.columns[0]: "Index" })
vector0['class'] = 0

In [7]:
# New dfConcat downsampled and classified vectors
df_downsample = pd.concat([vector1, vector0])
df_downsample.head()

Unnamed: 0,Index,class
0,10,1
1,14,1
2,21,1
3,26,1
4,50,1


In [8]:
# Add features to new df
df3 = df_downsample.join(df, on='Index')
#df4 = df3.drop("Zip_Code", axis=1)
df4 = df3.drop(["Brand","Address","City","State","Probability", "Index", "Labels"], axis=1)
# remove negative numbers in Median Income
df4 = df4[(df4['Median_Income'] >= 0) | (df4['Median_Income'].isnull())]
# drop NaN rows
df4 = df4.dropna()

In [9]:
# Define ML features (X) and label (y)
X = df4.drop("class", axis=1)
y = df4["class"]

formula = 'y ~ Total_Population+Median_Income+num_of_fast_food+num_of_mex_restaurants+Zip_Code'
print(X.shape, y.shape)

(1920, 5) (1920,)


In [10]:
# Import GLM dependancy
import statsmodels.formula.api as smf

In [11]:
model = smf.glm(formula = formula, data=df4, family=sm.families.NegativeBinomial())
result = model.fit()
print(result.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                      y   No. Observations:                 1920
Model:                            GLM   Df Residuals:                     1914
Model Family:        NegativeBinomial   Df Model:                            5
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -1869.2
Date:                Thu, 16 May 2019   Deviance:                       974.20
Time:                        20:19:08   Pearson chi2:                     611.
No. Iterations:                     5   Covariance Type:             nonrobust
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                 -0.6145      0.143     -4.282      0.000      -0.896      -0.333
Total_Population

In [12]:
print("Coefficeients")
print(result.params)
print()
print("p-Values")
print(result.pvalues)
print()
print("Dependent variables")
print(result.model.endog_names)

Coefficeients
Intercept                -0.614483
Total_Population         -0.000001
Median_Income             0.000002
num_of_fast_food         -0.000148
num_of_mex_restaurants    0.000411
Zip_Code                 -0.000005
dtype: float64

p-Values
Intercept                 0.000019
Total_Population          0.674975
Median_Income             0.333062
num_of_fast_food          0.614362
num_of_mex_restaurants    0.366065
Zip_Code                  0.060626
dtype: float64

Dependent variables
y


In [13]:
# DF with non-CMG locations
X2 = df[df['Labels'] != 1]
X2 = X2.drop(["Brand","Address","City","State","Probability","Labels"], axis=1)
# remove negative numbers in Median Income
X2 = X2[(X2['Median_Income'] >= 0) | (X2['Median_Income'].isnull())]
# drop NaN rows
X2 = X2.dropna()
#X2 = X2.set_index('Zip_Code')
X2.describe()

Unnamed: 0,Zip_Code,Total_Population,Median_Income,num_of_fast_food,num_of_mex_restaurants
count,18994.0,18994.0,18994.0,18994.0,18994.0
mean,40121.350374,8214.487154,54771.12067,141.993261,88.82426
std,25898.400544,12449.625219,22581.03438,237.964331,174.455476
min,1001.0,25.0,2499.0,0.0,0.0
25%,18079.25,826.0,40820.0,22.0,14.0
50%,35749.5,2684.5,50417.0,62.0,37.0
75%,59294.25,10077.0,62625.0,164.0,94.0
max,99922.0,114647.0,250001.0,6200.0,6100.0


In [14]:
# Run non-Chipotle location details through model and predict probability of opening in a new market
predictions = result.predict(X2)

estimates = pd.DataFrame(predictions)

In [15]:
df5 = estimates.join(X2)
df5.head()

Unnamed: 0,0,Zip_Code,Total_Population,Median_Income,num_of_fast_food,num_of_mex_restaurants
131,0.573775,1001.0,17537.0,57694.0,87.0,30.0
132,0.56427,1002.0,30280.0,52379.0,38.0,33.0
133,0.53384,1003.0,11131.0,2499.0,18.0,14.0
134,0.771062,1005.0,5014.0,70325.0,7.0,632.0
135,0.602992,1007.0,14906.0,86165.0,255.0,97.0


In [16]:
# export to csv
df5.to_csv("Predicted_Locations.csv")

In [19]:
ZC = 27829
TP = 25000
MI = 65000
FF = 200
MR = 35

new = pd.DataFrame({'Zip_Code': [ZC],'Total_Population': [TP],'Median_Income': [MI],'num_of_fast_food': [FF],'num_of_mex_restaurants': [MR]})

new_data = pd.DataFrame(new)

In [20]:
predictions = result.predict(new_data)
predictions

0    0.49293
dtype: float64