In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import cross_validation
from sklearn.cross_validation import train_test_split
import statsmodels.formula.api as smf



#### Data

The training data come from the FBI Uniform Crime Reporting (UCR) program and cover offenses known to law enforcement in New York State cities in 2013.

link: https://ucr.fbi.gov/crime-in-the-u.s/2013/crime-in-the-u.s.-2013/tables/table-8/table-8-state-cuts/table_8_offenses_known_to_law_enforcement_new_york_by_city_2013.xls

The test data come from the same FBI Uniform Crime Reporting (UCR) table for New York State cities in 2012.

link: https://ucr.fbi.gov/crime-in-the-u.s/2012/crime-in-the-u.s.-2012/tables/8tabledatadecpdf/table-8-state-cuts/table_8_offenses_known_to_law_enforcement_by_new_york_by_city_2012.xls

In [None]:
def _comma_counts_to_numeric(column):
    """Return ``column`` as numbers, removing thousands separators when it holds text."""
    # read_csv already yields a numeric dtype when no value in the column
    # contains a comma; the .str accessor would raise on such a column.
    if not pd.api.types.is_numeric_dtype(column):
        column = column.str.replace(',', '')
    return pd.to_numeric(column)


def import_clean(train_filename,city_exclude):
    """Load an FBI offenses CSV and return the modelling columns as numbers.

    Parameters
    ----------
    train_filename : str or file-like
        Anything ``pandas.read_csv`` accepts. The column header is expected
        on the fifth line of the file (``header=4``).
    city_exclude : str
        Rows whose ``City`` equals this value are removed. A value that
        matches no city leaves every row in place.

    Returns
    -------
    pandas.DataFrame
        Columns ``City``, ``Population``, ``Murder``, ``Rape``,
        ``Property_Crime`` and ``Robbery``. Rows with a missing value in any
        of these columns are dropped; the original row index is kept.

    Raises
    ------
    KeyError
        If one of the expected source columns is absent.
    ValueError
        If a count column holds text that is not a number after the commas
        are removed.
    """
    short_names = {
        "Murder and\nnonnegligent\nmanslaughter": "Murder",
        'Rape\n(legacy\ndefinition)2': "Rape",
        "Property\ncrime": "Property_Crime",
    }
    count_columns = ["Population", "Murder", "Rape", "Property_Crime", "Robbery"]

    raw = pd.read_csv(train_filename, header=4)
    # The last three rows are always discarded (presumably footnotes of the
    # FBI export -- TODO confirm against the source file).
    cleaned = raw.iloc[:-3].rename(columns=short_names)
    cleaned = cleaned[cleaned["City"] != city_exclude]
    # Work on an explicit copy so the assignments below never touch a view.
    cleaned = cleaned[["City"] + count_columns].copy()
    for name in count_columns:
        cleaned[name] = _comma_counts_to_numeric(cleaned[name])
    return cleaned.dropna()
    

In [None]:
def train_import_clean(train_data_filename,city_exclude):
    """Clean the training file and derive the regression features.

    Calls ``import_clean`` and then adds ``population_squared`` and replaces
    the ``Murder``, ``Robbery`` and ``Rape`` counts with 0/1 indicators
    (1 when the count is at least one).

    Parameters
    ----------
    train_data_filename : str or file-like
        Passed through to ``import_clean``.
    city_exclude : str
        City name to drop; passed through to ``import_clean``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with the derived columns.
    """
    train_data = import_clean(train_data_filename,city_exclude)
    train_data["population_squared"] = train_data["Population"] ** 2
    # Each indicator is derived from its OWN count column; previously the
    # Robbery flag was computed from the Rape counts by mistake.
    for crime in ("Murder", "Robbery", "Rape"):
        train_data[crime] = np.where(train_data[crime] >= 1, 1, 0)
    return train_data

In [None]:
def test_import_clean(test_data,city_exclude):
    raw_test_data = pd.read_csv(test_data, header = 4)
    test_data= raw_test_data
    test_data.drop(test_data.tail(3).index,inplace=True)
    test_data['Population'] = test_data['Population'].str.replace(',', '')
    test_data['Forcible\nrape'] = test_data['Forcible\nrape'].str.replace(',', '')
    test_data['Property\ncrime'] = test_data['Property\ncrime'].str.replace(',', '')
    test_data['Property\ncrime'] = pd.to_numeric(test_data['Property\ncrime'])
    test_data['Burglary'] = test_data['Burglary'].str.replace(',', '')
    test_data['Burglary'] = pd.to_numeric(test_data['Burglary'])  
    test_data['Population'] = pd.to_numeric(test_data['Population'])
    test_data['Forcible\nrape'] = pd.to_numeric(test_data['Forcible\nrape'])
    test_data['Robbery'] = test_data['Robbery'].str.replace(',', '')
    test_data['Robbery'] = pd.to_numeric(test_data['Robbery'])
    test_data = test_data[test_data["City"] != "New York"]
    test_data.rename(columns={"Murder and\nnonnegligent\nmanslaughter":"Murder",'Forcible\nrape':"Rape","Property\ncrime":"Property_Crime"},inplace=True)
    test_data = test_data[["City","Population","Murder","Rape","Property_Crime","Robbery","Burglary"]]
    test_data= test_data.dropna()
    test_data["population_squared"] = test_data["Population"] **2 
    test_data["Murder"] = np.where(test_data.Murder >= 1, 1,0)
    test_data["Robbery"] = np.where(test_data.Rape >=1,1,0)
    test_data["Rape"] = np.where(test_data.Rape >=1,1,0)
    test_data["Burglary"] = np.where(test_data.Burglary >=1,1,0)
    return test_data

In [None]:
#Clean the training file, fit a linear regression on it and report the fit
def run_regr(train_filename,city_exclude):
    """Fit Property_Crime on the four model features and print the results.

    The model is fitted and scored on the same rows, so the printed
    R-squared is an in-sample figure. Nothing is returned.

    Parameters
    ----------
    train_filename : str
        CSV passed to ``train_import_clean``.
    city_exclude : str
        City name to drop before fitting.
    """
    frame = train_import_clean(train_filename, city_exclude)
    features = frame[["Population","population_squared","Murder","Robbery"]]
    # scikit-learn is given the target as a single-column 2-D array.
    target = frame["Property_Crime"].values.reshape(-1,1)

    model = linear_model.LinearRegression()
    model.fit(features, target)

    print('\nCoefficients: \n', model.coef_)
    print('\nIntercept: \n', model.intercept_)
    print('\nR-squared:')
    print(model.score(features, target))

In [2]:
#Training will be done on the 2013 data
train_filename =  "New_York_Offenses_2013.csv" 
#Testing will be done on the 2012 data
test_filename = "New_York_Offenses_2012.csv"

In [None]:
#Run training data regression including New York
#(city_exclude is compared with the City column; presumably no row has an empty name, so nothing is removed -- confirm)
run_regr(train_filename,city_exclude="")

In [None]:
#Run training data regression excluding New York
#(rows whose City equals "New York" are removed before the model is fitted)
run_regr(train_filename,city_exclude="New York")

In [None]:
#Fit the same model with statsmodels, which reports a p-value per coefficient
def run_statsmodels(train_file,city_exclude):
    """Fit the OLS formula model and print its R-squared and p-values.

    Nothing is returned.

    Parameters
    ----------
    train_file : str
        CSV passed to ``train_import_clean``.
    city_exclude : str
        City name to drop before fitting.
    """
    frame = train_import_clean(train_file, city_exclude)
    model_spec = smf.ols(
        formula="Property_Crime ~ Population+population_squared+Murder+Robbery",
        data=frame,
    )
    fitted = model_spec.fit()
    print(fitted.rsquared)
    print(fitted.pvalues)


In [None]:
#Stats models excluding New York City
#(uses train_filename from the configuration cell; the name Train is never defined in this notebook)
run_statsmodels(train_filename,city_exclude="New York")

In [None]:
#Stats models including New York City
#(uses train_filename from the configuration cell; the name Train is never defined in this notebook)
run_statsmodels(train_filename,city_exclude="")

In [None]:
#Fit on one file and score the fitted model on a second file
def run_train_test(train_file,test_file,city_exclude):
    """Train on ``train_file``, then print R-squared for both files.

    Nothing is returned.

    Parameters
    ----------
    train_file : str
        CSV passed to ``train_import_clean``.
    test_file : str
        CSV passed to ``test_import_clean``.
    city_exclude : str
        City name handed to both cleaning functions.
    """
    feature_names = ["Population","population_squared","Murder","Robbery"]

    fit_frame = train_import_clean(train_file, city_exclude)
    eval_frame = test_import_clean(test_file, city_exclude)

    # scikit-learn is given each target as a single-column 2-D array.
    fit_x = fit_frame[feature_names]
    fit_y = fit_frame["Property_Crime"].values.reshape(-1,1)
    eval_x = eval_frame[feature_names]
    eval_y = eval_frame["Property_Crime"].values.reshape(-1,1)

    model = linear_model.LinearRegression()
    model.fit(fit_x, fit_y)

    print('\nCoefficients: \n', model.coef_)
    print('\nIntercept: \n', model.intercept_)
    print('\nTrain R-squared:')
    print(model.score(fit_x, fit_y))
    print('\nTest R-squared:')
    print(model.score(eval_x, eval_y))

In [None]:
#Train 2013 Data, Test 2012, NYC excluded
#(uses train_filename / test_filename from the configuration cell; Train and Test are never defined in this notebook)
run_train_test(train_filename,test_filename,city_exclude="New York")

In [None]:
#Train 2013 Data, Test 2012, NYC included
#(uses train_filename / test_filename from the configuration cell; Train and Test are never defined in this notebook)
run_train_test(train_filename,test_filename,city_exclude="")

In [None]:
#Holdout
def run_train_test_split(train_file,city_exclude,test_size=0.33,random_state=34):
    """Fit on a random subset of the training file and score on the held-out rest.

    Prints the coefficients, the intercept and the R-squared measured on the
    held-out rows. Nothing is returned.

    Parameters
    ----------
    train_file : str
        CSV passed to ``train_import_clean``.
    city_exclude : str
        City name to drop before splitting.
    test_size : float, optional
        Fraction of rows held out for scoring. Defaults to 0.33, the value
        that used to be hard-coded.
    random_state : int, optional
        Seed for the split, so repeated runs give the same partition.
        Defaults to 34, the value that used to be hard-coded.
    """
    train_data = train_import_clean(train_file,city_exclude)
    regr = linear_model.LinearRegression()
    # scikit-learn is given the target as a single-column 2-D array.
    Y = train_data["Property_Crime"].values.reshape(-1,1)
    X = train_data[["Population","population_squared","Murder","Robbery"]]
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )
    regr.fit(x_train,y_train)
    # score() predicts on x_test internally, so no separate predict() call is needed.
    print('\nCoefficients: \n', regr.coef_)
    print('\nIntercept: \n', regr.intercept_)
    print('\nR-squared:')
    print(regr.score(x_test, y_test))

In [None]:
#Run holdout with NYC excluded
#(uses train_filename from the configuration cell; the name Train is never defined in this notebook)
run_train_test_split(train_filename,city_exclude="New York")

In [None]:
#Score the model on each of ten cross-validation folds
def run_KFold(train_file,city_exclude):
    """Print the held-out R-squared of the model for every one of 10 folds.

    For each fold the model is refitted on the other folds and scored on the
    rows left out. Nothing is returned.

    Parameters
    ----------
    train_file : str
        CSV passed to ``train_import_clean``.
    city_exclude : str
        City name to drop before splitting.
    """
    feature_names = ["Population","population_squared","Murder","Robbery"]

    frame = train_import_clean(train_file, city_exclude)
    model = linear_model.LinearRegression()
    folds = cross_validation.KFold(n=len(frame), n_folds=10)

    for fit_rows, held_out_rows in folds:
        # The fold indices are positions, hence iloc rather than loc.
        fit_part = frame.iloc[fit_rows]
        held_out_part = frame.iloc[held_out_rows]

        fit_y = fit_part["Property_Crime"].values.reshape(-1,1)
        held_out_y = held_out_part["Property_Crime"].values.reshape(-1,1)

        model.fit(fit_part[feature_names], fit_y)
        print('\nR-squared:')
        print(model.score(held_out_part[feature_names], held_out_y))

In [None]:
#Run KFold with NYC excluded
#(uses train_filename from the configuration cell; the name Train is never defined in this notebook)
run_KFold(train_filename,city_exclude="New York")

As discussed, I used several validation methods, including K-fold cross-validation and a holdout split.  I also brought in a data set from a different year and tested the model's predictions on it.  To adjust the model, I added and removed features while monitoring R-squared.  I also used statsmodels to inspect the p-values, which helped me determine which features were benefiting the model and which weren't.