# Crime Prediction

In [56]:
import pandas
import numpy
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree, svm, linear_model, preprocessing
from sklearn.pipeline import Pipeline
import io
from scipy import misc
import pydotplus
import graphviz
import matplotlib.pyplot as plt
from IPython.display import Image
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.naive_bayes import GaussianNB

In [57]:
oriDF = pandas.read_csv("communities-crime-clean.csv")
oriDF

Unnamed: 0,state,communityname,fold,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,agePct12t21,...,PctForeignBorn,PctBornSameState,PctSameHouse85,PctSameCity85,PctSameState85,LandArea,PopDens,PctUsePubTrans,LemasPctOfficDrugUn,ViolentCrimesPerPop
0,1,Alabastercity,7,0.01,0.61,0.21,0.83,0.02,0.01,0.41,...,0.03,0.70,0.40,0.34,0.57,0.05,0.06,0.01,0.00,0.06
1,1,AlexanderCitycity,10,0.01,0.41,0.55,0.57,0.01,0.00,0.47,...,0.00,0.93,0.66,0.82,0.84,0.11,0.03,0.01,0.00,0.14
2,1,Annistoncity,3,0.03,0.34,0.86,0.30,0.04,0.01,0.41,...,0.04,0.77,0.59,0.70,0.64,0.06,0.11,0.04,0.00,1.00
3,1,Athenscity,8,0.01,0.38,0.35,0.71,0.04,0.01,0.39,...,0.03,0.78,0.56,0.67,0.71,0.09,0.05,0.00,0.00,0.23
4,1,Auburncity,1,0.04,0.37,0.32,0.70,0.21,0.02,1.00,...,0.12,0.49,0.12,0.00,0.15,0.09,0.09,0.01,0.00,0.15
5,1,Bessemercity,6,0.04,0.44,1.00,0.10,0.00,0.00,0.43,...,0.00,0.96,0.74,0.95,0.89,0.11,0.07,0.13,0.00,1.00
6,1,Birminghamcity,2,0.41,0.37,1.00,0.02,0.03,0.01,0.41,...,0.03,0.90,0.64,0.86,0.82,0.43,0.15,0.20,0.38,1.00
7,1,Cullmancity,1,0.01,0.30,0.00,0.99,0.02,0.01,0.38,...,0.00,0.82,0.54,0.72,0.76,0.04,0.07,0.01,0.00,0.16
8,1,Daphnecity,7,0.00,0.39,0.31,0.75,0.02,0.02,0.35,...,0.03,0.60,0.46,0.40,0.54,0.03,0.08,0.01,0.00,0.05
9,1,Decaturcity,10,0.06,0.39,0.32,0.73,0.04,0.01,0.39,...,0.03,0.71,0.47,0.61,0.60,0.14,0.09,0.01,0.00,0.22


## 1: Decision Tree Classification

### a:

In [58]:
oriDF['highCrime'] = (oriDF['ViolentCrimesPerPop'] > 0.1)
oriDF[['ViolentCrimesPerPop', 'highCrime']]

Unnamed: 0,ViolentCrimesPerPop,highCrime
0,0.06,False
1,0.14,True
2,1.00,True
3,0.23,True
4,0.15,True
5,1.00,True
6,1.00,True
7,0.16,True
8,0.05,False
9,0.22,True


In [59]:
noofT = oriDF['highCrime'].value_counts()[0]
noofF = oriDF['highCrime'].value_counts()[1]
totalVals = noofT + noofF
print("Percent of \"True\" instances are: "+ str(format(float((noofT*100)/totalVals), '.2f')) + " %")
print("Percent of \"False\" instances are: "+ str(format(float((noofF*100)/totalVals), '.2f')) + " %")

Percent of "True" instances are: 37.28 %
Percent of "False" instances are: 62.72 %


### b: Decision Tree Classifier

In [60]:
allCols = list(oriDF)

exclu = ['state', 'communityname', 'fold', 'ViolentCrimesPerPop', 'highCrime']
inclu = [item for item in allCols if item not in exclu]

modDF = oriDF[inclu]
modDF.data = modDF.copy()
modDF.target = oriDF['highCrime']

In [61]:
model = DecisionTreeClassifier()
model.fit(modDF.data, modDF.target)
print(model)

expected = modDF.target
predicted = model.predict(modDF.data)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')


In [62]:
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

             precision    recall  f1-score   support

      False       1.00      1.00      1.00       743
       True       1.00      1.00      1.00      1250

avg / total       1.00      1.00      1.00      1993

[[ 743    0]
 [   0 1250]]


In [63]:
important_features = pandas.DataFrame(model.feature_importances_, columns = ["Importance"], index = modDF.data.columns).sort_values("Importance", ascending=False)
print("Importance of Features(Top 10)")
important_features[:10]

Importance of Features(Top 10)


Unnamed: 0,Importance
PctKids2Par,0.358652
racePctWhite,0.087891
racePctHisp,0.048585
PctLess9thGrade,0.020673
PctEmplManu,0.020141
PctSpeakEnglOnly,0.01498
PctImmigRec8,0.013952
PctHousOwnOcc,0.013686
PctNotHSGrad,0.013448
MedRent,0.01329


In [64]:
def show_tree(decisionTree, file_path):
    dotfile = io.StringIO()
    export_graphviz(decisionTree, out_file=dotfile)
    pydotplus.graph_from_dot_data(dotfile.getvalue()).write_png(file_path)
    i = misc.imread(file_path)
    plt.imshow(i)

# To use it
show_tree(model, 'test.png')

<img src="test.png">

Important Features Summary:

In the Decision tree above we can see that the highest impportance is given to the feature "PctKids2Par". The decision tree may predict states that the crime rate is high where percent of kids having both parents is low, for this we can come to the conclusion that the single parents might be inclined towards comitting crime to support their children all by him/her self. 

The other defining attirbutes also include percent of different races living in that area and we can also see education of the people is also an importanrt facter. Despite these attributes which doesnt make any meaning like amount of people working in manifacturing still we might want to consider the percent of people under and above poverty line because it makes sense.

### b: Decision Tree Classifier with 10 fold cross validation

In [65]:
model1 = DecisionTreeClassifier()
print(model1)

def APR(model):
    accuracy = cross_val_score(model, modDF.data, modDF.target, cv = 10, scoring='accuracy')
    precision = cross_val_score(model, modDF.data, modDF.target, cv = 10, scoring='precision')
    recall = cross_val_score(model, modDF.data, modDF.target, cv = 10, scoring='recall')
    print("Accuracy: %0.2f (+/- %0.2f)" % (accuracy.mean(), accuracy.std() * 2))
    print("Precision: %0.2f (+/- %0.2f)" % (precision.mean(), precision.std() * 2))
    print("Recall: %0.2f (+/- %0.2f)" % (recall.mean(), recall.std() * 2))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')


#### i) Accuracy, Percision and Recall for Decision Tree Classifier with 10 fold cross validation

In [66]:
APR(model1)

Accuracy: 0.72 (+/- 0.09)
Precision: 0.79 (+/- 0.05)
Recall: 0.77 (+/- 0.18)


#### ii) Alalysis on Accuracy, Percision and Recall

In the initial decision tree classification we can see that all Accuracy, Percision and Recall were 100 % because the model built was overfitted to the training set but once we applied cross validation the data was split into train and test and the model was buit on that and now the Accuracy, Percision and Recall for the new model went down more than by 20 % and the new model is not overfitted towrds the training set and thus we can assume that the new model will perform better when encountered with new datapoints.

## 2: Linear Classification

### a. Naive Bayes Classifier 

#### i) Naive Bayes Classification with 10 fold Cross Validation

In [67]:
gnbModel = GaussianNB()

print(gnbModel)

GaussianNB(priors=None)


In [68]:
APR(gnbModel)

Accuracy: 0.76 (+/- 0.11)
Precision: 0.91 (+/- 0.12)
Recall: 0.69 (+/- 0.18)


#### ii) Top 10 Features according to their Predictiveness

In [69]:
def Predictiveness(cname):
    tVals = numpy.array(oriDF.loc[oriDF.highCrime == True, cname])
    fVals = numpy.array(oriDF.loc[oriDF.highCrime == False, cname])
    nume = abs(tVals.mean() - fVals.mean())
    deno = tVals.std() + fVals.std()
    val = float('%.4f' % (nume/deno))
    return val

def PredictiveFeatures(df):
    tupList = []
    colNames = list(df.columns.values)
    for cname in colNames:
        tup = (cname, Predictiveness(cname))
        tupList.append(tup)
    return tupList

labels = ['Feature', 'Predictiveness']
PredictiveFeaturesList = pandas.DataFrame.from_records(PredictiveFeatures(modDF), columns=labels).sort_values("Predictiveness", ascending=False)
PredictiveFeaturesList[:10]

Unnamed: 0,Feature,Predictiveness
44,PctKids2Par,0.8097
43,PctFam2Par,0.7455
3,racePctWhite,0.7352
50,PctIlleg,0.7093
40,FemalePctDiv,0.694
41,TotalPctDiv,0.6746
45,PctYoungKids2Par,0.665
15,pctWInvInc,0.6611
46,PctTeen2Par,0.6429
38,MalePctDivorce,0.6169


In the above result we can see the the feature "PctKids2Par" is counted as the most predictive among other feature and this is same the most important feature from the decision tree classification. This phenomenon makes sense because the formule on which the predictiveness is directly proportional to the difference in the means of features with their respective classes this is because the mean represents the data spread of that feature for that respective class hence the higher the difference between means the easier to differentiate between the classes.

#### iii) Comparison between the Feature Predictiveness and Important Features from Decision Tree Classification

Form the above results we can see that their are many features differences between "Features with high Predictiveness" and "Important Features" but we can see that there are two common features('PctKids2Par', 'racePctWhite') in the top three features from both lists by which we can assume that these features have a big take in the decision making because they showed up in both ways of finding important features.

### b) linear Support Vector Classification

#### i) Linear Support Vector Classification with 10 fold Cross Validation

In [70]:
linearModel = svm.LinearSVC()
print(linearModel)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)


In [71]:
APR(linearModel)

Accuracy: 0.80 (+/- 0.11)
Precision: 0.85 (+/- 0.10)
Recall: 0.83 (+/- 0.23)


In [72]:
linearModel.fit(modDF.data, modDF.target)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

#### ii) Feature Weights - Top 10

In [73]:
feature_weights = pandas.DataFrame(abs(linearModel.coef_[0]), columns = ["Weight"], index = modDF.data.columns).sort_values("Weight", ascending=False)
feature_weights[:10]

Unnamed: 0,Weight
pctWInvInc,1.888486
PersPerOccupHous,1.755127
racePctWhite,1.500219
PctKids2Par,1.190329
RentHighQ,1.066885
MalePctDivorce,1.065696
NumUnderPov,1.051548
NumStreet,1.019155
PctOccupMgmtProf,1.014671
population,1.002301


In the above result, we can see attributes having the absolute weights. The highest weight corresponds to the attribute “PersPerOccupHous” which tells us the number of persons living per household and this could affect the crime rate in that area. There are some other attributes which also contribute to the crime rate like races and poverty line but there are some attributes with high weights but doesn’t quit affect the crime rate like “pctWInvInc: percentage of households with investment / rent income in 1989” but we should not just rule this out because they might indirectly affect the crime rate. From this analysis, we can see that the weights of the attributes can give the importance of that attribute in decision making and this is because the coefficients represent the vector orthogonal to the linear hyper plane which separates the data and the higher the coefficient’s absolute value the further it is away from the plane and the further it is away gives more info on classification of the data points. Thus, these weights give us the most predictive features in the dataset.

#### iii) Comparison between the Feature Weights and Important Features from Decision Tree Classification

When we compare the above results with the top 10 important features from decision tree classification we can see that we have some new attributes which we have high weights/importance like ‘PersPerOccupHous’, ‘pctWInvInc’, ‘NumStreet’, etc., these attributes did not show up in the previous but managed to rack up high weights and the thing to observe is that we have some attributes which are common from both list of features like ‘racePctWhite’ and ‘PctKids2Par’. These feature managed to get into the most influencing attributes lists from different classification techniques and this tells us that these attributes have a influencing vote on the deciding the classification of a data point.

## 3: Regression

### a) Linear Regression

In [74]:
linRegModel = linear_model.LinearRegression()
linRegModel.fit(modDF.data, oriDF['ViolentCrimesPerPop'])
lrmExpec = oriDF['ViolentCrimesPerPop'].copy()
lrmPredic = linRegModel.predict(modDF.data)

#### i) Mean Squared Error with 10 fold Cross Validation

In [75]:
mse = cross_val_score(linear_model.LinearRegression(), modDF.data, oriDF['ViolentCrimesPerPop'], cv=10, scoring='neg_mean_squared_error')
print("Mean Squared Error: %0.2f (+/- %0.2f)" % (mse.mean(), mse.std() * 2))

Mean Squared Error: -0.02 (+/- 0.01)


#### ii) Mean Squared Error

In [76]:
mse1 = metrics.mean_squared_error(lrmExpec, lrmPredic)
print("Mean Squared Error: %0.4f" % mse1)

Mean Squared Error: 0.0165


#### iii) 

##### Top 10 Features for predicting High Crime Rate

In [77]:
feature_weights_pVe = pandas.DataFrame(linRegModel.coef_, columns = ["Weight"], index = modDF.data.columns).sort_values("Weight", ascending=False)
feature_weights_pVe[:10]

Unnamed: 0,Weight
PersPerOccupHous,0.635088
PctHousOwnOcc,0.568133
MalePctDivorce,0.458517
PctRecImmig8,0.432511
MedRent,0.372728
medFamInc,0.287979
PctEmploy,0.248474
MalePctNevMarr,0.226728
PctPersDenseHous,0.214353
OwnOccMedVal,0.212876


##### Top 10 Features for predicting Low Crime Rate

In [78]:
feature_weights_nVe = pandas.DataFrame(linRegModel.coef_, columns = ["Weight"], index = modDF.data.columns).sort_values("Weight", ascending=True)
feature_weights_nVe[:10]

Unnamed: 0,Weight
PctPersOwnOccup,-0.675694
TotalPctDiv,-0.561924
whitePerCap,-0.351016
PctKids2Par,-0.322651
OwnOccLowQuart,-0.30817
numbUrban,-0.296443
PersPerRentOccHous,-0.254572
RentLowQ,-0.234752
agePct12t29,-0.229218
PctRecImmig5,-0.218221


### b) Ridge Regression

In [79]:
#alpha=[10.0, 1.0, 0.1, 0.01, 0.001]
rModel = linear_model.Ridge()
rModel.fit(modDF.data, oriDF['ViolentCrimesPerPop'])
rExpec = oriDF['ViolentCrimesPerPop'].copy()
rPredic = rModel.predict(modDF.data)

#### i) Mean Squared Error with 10 fold Cross Validation

In [80]:
rMse = cross_val_score(rModel, modDF.data, oriDF['ViolentCrimesPerPop'], cv=10, scoring='neg_mean_squared_error')
print("Estimated Mean Squared Error: %0.2f (+/- %0.2f)" % (rMse.mean(), rMse.std() * 2))

Estimated Mean Squared Error: -0.02 (+/- 0.01)


#### ii) Mean Squared Error

In [81]:
rMse1 = metrics.mean_squared_error(rExpec, rPredic)
print("Mean Squared Error: %0.4f" % rMse1)

Mean Squared Error: 0.0168


#### iii) Best Alpha

In [82]:
rcvModel = linear_model.RidgeCV(alphas=[10.0, 1.0, 0.1, 0.01, 0.001])
rcvModel.fit(modDF.data, oriDF['ViolentCrimesPerPop'])
print("Best Alpha from [10.0, 1.0, 0.1, 0.01, 0.001] is: %0.1f" % rcvModel.alpha_)

Best Alpha from [10.0, 1.0, 0.1, 0.01, 0.001] is: 1.0


#### iv)

The MSE computed when we used Linear Regression is 0.0165 and when we used Ridge Regression then is 0.0168 we can observe that there is an increase in the squared error. When a model is overfitted to the training set then the squared error is closer towards 0 and 0 when the model is completely overfitted. So, in this scenario we can see that the MSE for Linear Regression is lower than Ridge thus we can assume that Linear Regression model is more overfitted than the Ridge model for this dataset. 

### c) Polynomial Regression

#### i) Mean Squared Error with 10 fold Cross Validation

In [83]:
pModel = Pipeline([('poly', preprocessing.PolynomialFeatures(degree=2)), ('linear', linear_model.LinearRegression())])
pModel.fit(modDF.data, oriDF['ViolentCrimesPerPop'])
pExpec = oriDF['ViolentCrimesPerPop'].copy()
pPredic = pModel.predict(modDF.data)

In [84]:
pMse = cross_val_score(pModel, modDF.data, oriDF['ViolentCrimesPerPop'], cv=10, scoring='neg_mean_squared_error')
print("Mean Squared Error: %0.2f (+/- %0.2f)" % (pMse.mean(), pMse.std() * 2))

Mean Squared Error: -0.13 (+/- 0.08)


#### ii) Mean Squared Error

In [85]:
pMse1 = metrics.mean_squared_error(pExpec, pPredic)
print("Mean Squared Error: %0.4f" % pMse1)
pMse1

Mean Squared Error: 0.0000


8.1944380154569338e-29

#### iii)

The linear regression will draw a linear plane to classify any new data points but as for the polynomial regression will form a polynomial plane which will try to fit to the plane to the data points this could lead to reduced sum of squares or even overfitting. When we have scored, the mean squared error for the dataset using the polynomial regression and we can see that the mean squared error went down very close to 0.0 when we compare with the linear regression and this can be because of the overfitting in the polynomial regression and we are trying to predict a numeric value rather than the True/False on which we have done in the previous experiments. But if applied the cross validation when the model is built we can see that the linear regression is being over fitted because the mean squared error is closer to zero but the cross-validation score for ploy is not that far from zero. From this analysis above we can consider linear regression to be a better fit because chance of overfitting is more in polynomial regression. 

## 4: Dirty Data

In [86]:
oriDF2 = pandas.read_csv("communities-crime-full.csv")
oriDF2 = oriDF2.replace('?', numpy.NaN)
oriDF2

Unnamed: 0,state,county,community,communityname,fold,population,householdsize,racepctblack,racePctWhite,racePctAsian,...,LandArea,PopDens,PctUsePubTrans,PolicCars,PolicOperBudg,LemasPctPolicOnPatr,LemasGangUnitDeploy,LemasPctOfficDrugUn,PolicBudgPerPop,ViolentCrimesPerPop
0,8,,,Lakewoodcity,1,0.19,0.33,0.02,0.90,0.12,...,0.12,0.26,0.20,0.06,0.04,0.9,0.5,0.32,0.14,0.20
1,53,,,Tukwilacity,1,0.00,0.16,0.12,0.74,0.45,...,0.02,0.12,0.45,,,,,0.00,,0.67
2,24,,,Aberdeentown,1,0.00,0.42,0.49,0.56,0.17,...,0.01,0.21,0.02,,,,,0.00,,0.43
3,34,5,81440,Willingborotownship,1,0.04,0.77,1.00,0.08,0.12,...,0.02,0.39,0.28,,,,,0.00,,0.12
4,42,95,6096,Bethlehemtownship,1,0.01,0.55,0.02,0.95,0.09,...,0.04,0.09,0.02,,,,,0.00,,0.03
5,6,,,SouthPasadenacity,1,0.02,0.28,0.06,0.54,1.00,...,0.01,0.58,0.10,,,,,0.00,,0.14
6,44,7,41500,Lincolntown,1,0.01,0.39,0.00,0.98,0.06,...,0.05,0.08,0.06,,,,,0.00,,0.03
7,6,,,Selmacity,1,0.01,0.74,0.03,0.46,0.20,...,0.01,0.33,0.00,,,,,0.00,,0.55
8,21,,,Hendersoncity,1,0.03,0.34,0.20,0.84,0.02,...,0.04,0.17,0.04,,,,,0.00,,0.53
9,29,,,Claytoncity,1,0.01,0.40,0.06,0.87,0.30,...,0.00,0.47,0.11,,,,,0.00,,0.15


In [87]:
oriDF2['highCrime'] = (oriDF2['ViolentCrimesPerPop'] > 0.1)
oriDF2[['ViolentCrimesPerPop', 'highCrime']]

Unnamed: 0,ViolentCrimesPerPop,highCrime
0,0.20,True
1,0.67,True
2,0.43,True
3,0.12,True
4,0.03,False
5,0.14,True
6,0.03,False
7,0.55,True
8,0.53,True
9,0.15,True


In [88]:
noofT2 = oriDF2['highCrime'].value_counts()[0]
noofF2 = oriDF2['highCrime'].value_counts()[1]
totalVals2 = noofT2 + noofF2
print("Percent of \"True\" instances are: "+ str(format(float((noofT2*100)/totalVals2), '.2f')) + " %")
print("Percent of \"False\" instances are: "+ str(format(float((noofF2*100)/totalVals2), '.2f')) + " %")

Percent of "True" instances are: 37.26 %
Percent of "False" instances are: 62.74 %


In [89]:
allCols2 = list(oriDF)

exclu2 = ['state', 'communityname', 'fold', 'ViolentCrimesPerPop', 'highCrime']
inclu2 = [item for item in allCols2 if item not in exclu2]

modDF2 = oriDF2[inclu2]
modDF2.data = modDF2.copy()
modDF2.target = oriDF2['highCrime']

values = modDF2.data.values
# fill missing values with mean column values
imputer = preprocessing.Imputer()
transformed_modDF2_data = imputer.fit_transform(values)







In [90]:
model2 = DecisionTreeClassifier()
model2.fit(transformed_modDF2_data, modDF2.target)
print(model2)

expected2 = modDF2.target
predicted2 = model2.predict(transformed_modDF2_data)

print(metrics.classification_report(expected2, predicted2))
print(metrics.confusion_matrix(expected2, predicted2))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
             precision    recall  f1-score   support

      False       1.00      1.00      1.00       743
       True       1.00      1.00      1.00      1251

avg / total       1.00      1.00      1.00      1994

[[ 743    0]
 [   0 1251]]


In [91]:
important_features2 = pandas.DataFrame(model2.feature_importances_, columns = ["Importance"], index = modDF2.data.columns).sort_values("Importance", ascending=False)
print("Importance of Features(Top 10)")
important_features2[:10]


Importance of Features(Top 10)


Unnamed: 0,Importance
PctKids2Par,0.36137
racePctWhite,0.088089
racePctHisp,0.045107
PctEmplManu,0.025833
PctLess9thGrade,0.020817
PctHousOwnOcc,0.014161
PctEmploy,0.01336
HousVacant,0.011952
OtherPerCap,0.011732
householdsize,0.011265


In [92]:
accuracy = cross_val_score(model2,transformed_modDF2_data, modDF2.target, cv = 10, scoring='accuracy')
precision = cross_val_score(model2, transformed_modDF2_data, modDF2.target, cv = 10, scoring='precision')
recall = cross_val_score(model2, transformed_modDF2_data, modDF2.target, cv = 10, scoring='recall')
print("Accuracy: %0.2f (+/- %0.2f)" % (accuracy.mean(), accuracy.std() * 2))
print("Precision: %0.2f (+/- %0.2f)" % (precision.mean(), precision.std() * 2))
print("Recall: %0.2f (+/- %0.2f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.75 (+/- 0.05)
Precision: 0.81 (+/- 0.05)
Recall: 0.79 (+/- 0.05)


### a) Analysis:

On comparison with the previous scores of Accuracy, Precision and Recall of the clean dataset and the scores from above dirty dataset we can see that the scores have improved. This is because when handling the missing we used imputing to fill out the missing values and due to this we have more information to classify the tree unlike the clean dataset where eliminated the missing values. Thus we have obtained better Accuracy, Precision and Recall scores.

## 5:(6 a) Extra Credit

### 5. a) 

### i) Non-Linear Kernel for SVM

#### Clena Data

In [93]:
svm3Model = svm.SVC(degree = 4)
svm3Model.fit(modDF.data, modDF.target)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=4, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [94]:
APR(svm3Model)

Accuracy: 0.82 (+/- 0.10)
Precision: 0.86 (+/- 0.11)
Recall: 0.86 (+/- 0.20)


#### Full Data

In [95]:
accuracy11 = cross_val_score(svm3Model,transformed_modDF2_data, modDF2.target, cv = 10, scoring='accuracy')
precision11 = cross_val_score(svm3Model, transformed_modDF2_data, modDF2.target, cv = 10, scoring='precision')
recall11 = cross_val_score(svm3Model, transformed_modDF2_data, modDF2.target, cv = 10, scoring='recall')
print("Accuracy: %0.2f (+/- %0.2f)" % (accuracy11.mean(), accuracy11.std() * 2))
print("Precision: %0.2f (+/- %0.2f)" % (precision11.mean(), precision11.std() * 2))
print("Recall: %0.2f (+/- %0.2f)" % (recall11.mean(), recall11.std() * 2))

Accuracy: 0.83 (+/- 0.02)
Precision: 0.86 (+/- 0.03)
Recall: 0.87 (+/- 0.04)


### Random Forest

#### Clena Data

In [96]:
rfModel = RandomForestClassifier()
rfModel

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [97]:
APR(rfModel)

Accuracy: 0.79 (+/- 0.11)
Precision: 0.84 (+/- 0.09)
Recall: 0.81 (+/- 0.23)


#### Full Data

In [98]:
accuracy12 = cross_val_score(svm3Model,transformed_modDF2_data, modDF2.target, cv = 10, scoring='accuracy')
precision12 = cross_val_score(svm3Model, transformed_modDF2_data, modDF2.target, cv = 10, scoring='precision')
recall12 = cross_val_score(svm3Model, transformed_modDF2_data, modDF2.target, cv = 10, scoring='recall')
print("Accuracy: %0.2f (+/- %0.2f)" % (accuracy12.mean(), accuracy11.std() * 2))
print("Precision: %0.2f (+/- %0.2f)" % (precision12.mean(), precision11.std() * 2))
print("Recall: %0.2f (+/- %0.2f)" % (recall12.mean(), recall11.std() * 2))

Accuracy: 0.83 (+/- 0.02)
Precision: 0.86 (+/- 0.03)
Recall: 0.87 (+/- 0.04)


### ii) Result Comparisions

When we compare the Accuracy, Percision adn Recall for a non lenear model SVM and a Random Forest and we can see that the scores are better when we used non lenear model.  

### iii) Predictive Features

In [99]:
rfModel.fit(modDF.data, modDF.target)
important_features3 = pandas.DataFrame(rfModel.feature_importances_, columns = ["Importance"], index = modDF2.data.columns).sort_values("Importance", ascending=False)
print("Importance of Features(Top 10)")
important_features3[:10]

Importance of Features(Top 10)


Unnamed: 0,Importance
NumIlleg,0.099234
PctKids2Par,0.065218
PctPersDenseHous,0.061595
PctTeen2Par,0.060323
PctFam2Par,0.044441
racepctblack,0.030988
pctWInvInc,0.026689
PctPersOwnOccup,0.026129
racePctHisp,0.025919
FemalePctDiv,0.022035


We can see that from the various lists of top predictive features using various models which we have a set of common attributes which show up in various lists, some of these are ‘PctFam2Par’, ‘PctKids2Par’, ‘racePctWhite’ and these attributes being able to predict the final predictions seems to be very reliable.  