## Import Packages

In [96]:
import csv
import numpy as np
from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC

## Extract Data

In [97]:
def findCommunities(filename):
    communities = []
    with open(filename, 'r') as csvfile:
        filereader = csv.reader(csvfile)
        j = 0
        for i in filereader:
            if j > 0:
                if i[1] not in communities:
                    communities.append(i[1])
            j+=1
    return communities
def extractData(filename,communities,addstate = False,addcommunity = False):
    features = []
    xPositive=[]
    yPositive=[]
    xNegative=[]
    yNegative =[]
    with open(filename, 'r') as csvfile:
        filereader = csv.reader(csvfile)
        j = 0
        for i in filereader:
            if j > 0:
                data = i[3:len(i)-1]
                if addstate:
                    state = [0.0]*56
                    state[int(i[0])-1] = 1.0
                    data = data + state
                    features = features + ["state"+str(i) for i in range(56)]
                if addcommunity:
                    tmp = [0.0]*len(communities)
                    tmp[communities.index(i[1])] = 1.0
                    data = data + tmp
                    features = features + communities
                vect = np.array(data)
                #vect = np.array(i[3:len(i)-1])
                #x.append(vect.astype(np.float))
                if float(i[-1]) > 0.1:
                    xPositive.append(vect.astype(np.float))
                    yPositive.append(1)
                else:
                    xNegative.append(vect.astype(np.float))
                    yNegative.append(0)
            else:
                features = i[3:len(i)-1]
            j+=1
        """yPositive = np.array(yPositive)
        xPositive = np.array(xPositive)
        yNegative = np.array(yNegative)
        xNegative = np.array(xNegative)"""
        features = np.array(features)
        indxP = int(len(xPositive)*.6)
        indxN = int(len(xNegative)*.6)
        xTrain =np.array(xPositive[:indxP] + xNegative[:indxN])
        yTrain = np.array(yPositive[:indxP] + yNegative[:indxN])
        xTest = np.array(xPositive[indxP:] + xNegative[indxN:])
        yTest = np.array(yPositive[indxP:] + yNegative[indxN:])
        
        mean_diff = (np.absolute(np.mean(xPositive[:indxP],axis = 0)-np.mean(xNegative[:indxN],axis = 0)))
        std_sum = (np.std(xPositive[:indxP],axis = 0)+np.std(xNegative[:indxN],axis = 0))
        feval = mean_diff/std_sum
            
            
        return xTrain,yTrain,xTest,yTest,features,feval

## Calculate Label Percentage

In [98]:
def calcualtePercentage(labels):
    n = len(labels)
    sumTrue = sum(labels)*1.0
    sumFalse = (n - sumTrue)*1.0
    return sumTrue/n,sumFalse/n

## Create Model

In [99]:
def gaussianNB():
    return GaussianNB()

In [100]:
def linearSVC():
    return LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', 
                     fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None,
                     max_iter=1000)

## Evaluation

In [101]:
def confusionMatrix(true,predicted,labels = [1,0]):
    return confusion_matrix(true,predicted,labels)

def find_confusion_matrix(actual,predicted,clabels):
    cm= []
    for i in clabels:
        tmp =[0]*len(clabels)
        
        for j in range(len(actual)):
            if actual[j]== i and actual[j] == predicted[j]:
                tmp[clabels.index(i)] += 1
            elif actual[j]== i and actual[j] != predicted[j]:
                tmp[clabels.index(predicted[j])] += 1
        cm.append(tmp)
    return np.array(cm)
def find_accuracy(matrix):
    return np.trace(matrix)*1.0/np.sum(matrix)
def find_precision(matrix):
    pres = []
    x = np.sum(matrix,axis=0)
    for i in range(len(matrix)):
        for j in range(len(matrix)):
            if i == j:
                pres.append(matrix[i][j]*1.0/x[i])
    return pres
def find_recall(matrix):
    rec = []
    x = np.sum(matrix,axis=1)
    for i in range(len(matrix)):
        for j in range(len(matrix)):
            if i == j:
                rec.append(matrix[i][j]*1.0/x[i])
    return rec
def evaluation(acutal,predicted,clabels=[1,0]):
    confmatrix = find_confusion_matrix(acutal,predicted,clabels)
    print ("Confusion Matrix")
    print (confmatrix)
    accuracy = find_accuracy(confmatrix)
    print ("Accuracy", accuracy)
    precision = find_precision(confmatrix)
    print ("Precision", precision)
    recall = find_recall(confmatrix)
    print ("Recall", recall)

## Cross Validation

In [102]:
def do_cross_validation(X, y,clf, n_folds=5):
    cv = KFold(len(y), n_folds, shuffle=True)
    accuracies = []
    i = 1
    for train_ind, test_ind in cv: 
        clf.fit(X[train_ind], y[train_ind])
        predictions = clf.predict(X[test_ind])
        print ("Fold %d Evaluation" %i)
        i +=1
        evaluation(y[test_ind],predictions)
        print ("\n")
        accuracies.append(accuracy_score(y[test_ind], predictions))
    avg = np.mean(accuracies)
    print ("The average Accuracy is ", avg)

### Read Data

In [103]:
filename = 'C:\\Users\\ar1\\Documents\ML\\Project\\Crime Prediction Data(1)\\Crime Prediction Data\\communities-crime-clean.csv'
distintCommunities = findCommunities(filename)
XTrain,YTrain,XTest,YTest,Features,FeatureEval = extractData(filename, distintCommunities)

### Percentage of Labels

In [104]:
print ("Train Percentage",calcualtePercentage(YTrain))
print ("Test Percentage",calcualtePercentage(YTest))

Train Percentage (0.62761506276150625, 0.3723849372384937)
Test Percentage (0.62656641604010022, 0.37343358395989973)


### Default GaussianNB Model

In [105]:
model= gaussianNB()
model.fit(XTrain,YTrain)
predictedY=model.predict(XTest)
evaluation(YTest,predictedY)

Confusion Matrix
[[354 146]
 [ 40 258]]
Accuracy 0.766917293233
Precision [0.89847715736040612, 0.63861386138613863]
Recall [0.70799999999999996, 0.86577181208053688]


### 10 Fold Cross Validation on Default GaussianNB Model

In [106]:
do_cross_validation(XTrain,YTrain,model,10)

Fold 1 Evaluation
Confusion Matrix
[[44 24]
 [ 4 48]]
Accuracy 0.766666666667
Precision [0.91666666666666663, 0.66666666666666663]
Recall [0.6470588235294118, 0.92307692307692313]


Fold 2 Evaluation
Confusion Matrix
[[56 15]
 [ 3 46]]
Accuracy 0.85
Precision [0.94915254237288138, 0.75409836065573765]
Recall [0.78873239436619713, 0.93877551020408168]


Fold 3 Evaluation
Confusion Matrix
[[49 20]
 [ 4 47]]
Accuracy 0.8
Precision [0.92452830188679247, 0.70149253731343286]
Recall [0.71014492753623193, 0.92156862745098034]


Fold 4 Evaluation
Confusion Matrix
[[51 27]
 [ 5 37]]
Accuracy 0.733333333333
Precision [0.9107142857142857, 0.578125]
Recall [0.65384615384615385, 0.88095238095238093]


Fold 5 Evaluation
Confusion Matrix
[[59 20]
 [ 6 35]]
Accuracy 0.783333333333
Precision [0.90769230769230769, 0.63636363636363635]
Recall [0.74683544303797467, 0.85365853658536583]


Fold 6 Evaluation
Confusion Matrix
[[55 24]
 [ 4 36]]
Accuracy 0.764705882353
Precision [0.93220338983050843, 0.5999999

### Top 10 important features in GaussianNB model

In [107]:
topFeature = np.argsort(FeatureEval)[::-1][:10]
print (Features[topFeature],"--",topFeature)

['PctKids2Par' 'PctFam2Par' 'FemalePctDiv' 'racePctWhite' 'TotalPctDiv'
 'PctIlleg' 'PctPersDenseHous' 'pctWInvInc' 'MalePctDivorce'
 'PctYoungKids2Par'] -- [44 43 40  3 41 50 68 15 38 45]


### Default LinearSVC Model

In [108]:
model= linearSVC()
model.fit(XTrain,YTrain)
predictedY=model.predict(XTest)
evaluation(YTest,predictedY)

Confusion Matrix
[[413  87]
 [ 64 234]]
Accuracy 0.810776942356
Precision [0.86582809224318658, 0.7289719626168224]
Recall [0.82599999999999996, 0.78523489932885904]


### 10 Fold Cross Validation on Default LinearSVC Model

In [109]:
do_cross_validation(XTrain,YTrain,model,10)

Fold 1 Evaluation
Confusion Matrix
[[65 15]
 [ 6 34]]
Accuracy 0.825
Precision [0.91549295774647887, 0.69387755102040816]
Recall [0.8125, 0.84999999999999998]


Fold 2 Evaluation
Confusion Matrix
[[66  9]
 [11 34]]
Accuracy 0.833333333333
Precision [0.8571428571428571, 0.79069767441860461]
Recall [0.88, 0.75555555555555554]


Fold 3 Evaluation
Confusion Matrix
[[65  8]
 [12 35]]
Accuracy 0.833333333333
Precision [0.8441558441558441, 0.81395348837209303]
Recall [0.8904109589041096, 0.74468085106382975]


Fold 4 Evaluation
Confusion Matrix
[[65  9]
 [10 36]]
Accuracy 0.841666666667
Precision [0.8666666666666667, 0.80000000000000004]
Recall [0.8783783783783784, 0.78260869565217395]


Fold 5 Evaluation
Confusion Matrix
[[68 19]
 [ 2 31]]
Accuracy 0.825
Precision [0.97142857142857142, 0.62]
Recall [0.7816091954022989, 0.93939393939393945]


Fold 6 Evaluation
Confusion Matrix
[[58  5]
 [17 39]]
Accuracy 0.81512605042
Precision [0.77333333333333332, 0.88636363636363635]
Recall [0.920634920634

### Top 10 important features in LinearSVC model

In [111]:
coef = model.coef_
topindx = np.argsort(coef)[0][::-1][:10]
print (Features[topindx])


['PctPersDenseHous' 'agePct12t21' 'PersPerOccupHous' 'MalePctDivorce'
 'racepctblack' 'PctImmigRecent' 'NumUnderPov' 'PctUnemployed' 'NumStreet'
 'PctSpeakEnglOnly']
