## Import Packages

In [87]:
import csv
import numpy as np
from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures

## Extract Data

In [88]:
def findCommunities(filename):
    """Return the distinct community names found in a CSV file.

    The first row of the file is treated as a header and skipped. For every
    remaining row the value in the second column (index 1) is taken as the
    community name.

    Parameters
    ----------
    filename : str
        Path to the CSV file.

    Returns
    -------
    list of str
        Unique community names in order of first appearance.
    """
    communities = []
    seen = set()  # O(1) membership test; the list preserves first-seen order
    with open(filename, 'r') as csvfile:
        filereader = csv.reader(csvfile)
        next(filereader, None)  # skip the header row (no-op on an empty file)
        for row in filereader:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing newline)
                continue
            name = row[1]
            if name not in seen:
                seen.add(name)
                communities.append(name)
    return communities
def extractData(filename,communities,addstate = False,addcommunity = False):
    """Load the CSV file and split it into train/test arrays.

    Row layout, as read by this function: column 0 is an integer state code
    (1-based), column 1 is the community name, columns 3 .. second-to-last
    are numeric features, and the last column is the regression target.
    Column 2 is not used. The first row is the header.

    Parameters
    ----------
    filename : str
        Path to the CSV file.
    communities : list of str
        Community names used for the community one-hot encoding; only read
        when ``addcommunity`` is true.
    addstate : bool, optional
        Append a 56-slot one-hot encoding of the state code to every row.
    addcommunity : bool, optional
        Append a one-hot encoding of the community name to every row.

    Returns
    -------
    xTrain, yTrain, xTest, yTest : numpy.ndarray
        The first 65% of the data rows (file order, no shuffling) form the
        training split; the rest form the test split.
    features : numpy.ndarray
        One name per column of ``xTrain`` / ``xTest``.

    Raises
    ------
    ValueError
        If a feature or target cell is not numeric, or a community name is
        not present in ``communities`` while ``addcommunity`` is true.
    """
    num_state_slots = 56
    features = []
    x = []
    y = []
    with open(filename, 'r') as csvfile:
        filereader = csv.reader(csvfile)
        for rownum, row in enumerate(filereader):
            if rownum == 0:
                # Header: keep only the names of the numeric feature columns.
                features = row[3:len(row)-1]
                continue
            if not row:
                # csv.reader yields [] for blank lines; nothing to parse.
                continue
            data = [float(value) for value in row[3:len(row)-1]]
            if addstate:
                state = [0.0]*num_state_slots
                state[int(row[0])-1] = 1.0  # state codes are 1-based
                data = data + state
            if addcommunity:
                onehot = [0.0]*len(communities)
                onehot[communities.index(row[1])] = 1.0
                data = data + onehot
            x.append(data)
            y.append(float(row[-1]))

    # Extend the feature names once (not once per data row) so that
    # len(features) always equals the number of columns in x.
    features = list(features)
    if addstate:
        features = features + ["state"+str(k) for k in range(num_state_slots)]
    if addcommunity:
        features = features + list(communities)
    features = np.array(features)

    indx = int(len(x)*.65)
    xTrain = np.array(x[:indx], dtype=float)
    yTrain = np.array(y[:indx])
    xTest = np.array(x[indx:], dtype=float)
    yTest = np.array(y[indx:])

    return xTrain,yTrain,xTest,yTest,features

## Create Model

In [89]:
def polyData(Data,degree):
    """Expand ``Data`` with a freshly fitted ``PolynomialFeatures(degree)``.

    A new transformer is created and fitted on every call; the transformed
    array is returned and the transformer itself is discarded.
    """
    expander = PolynomialFeatures(degree)
    expanded = expander.fit_transform(Data)
    return expanded

In [90]:
def ridgeRegression(alpha = 1.0):
    """Build an unfitted ``Ridge`` regressor with the given regularization strength.

    Parameters
    ----------
    alpha : float, optional
        Regularization strength passed to ``Ridge``.

    Returns
    -------
    sklearn.linear_model.Ridge
        A new, unfitted estimator.
    """
    # `normalize=False` is intentionally not passed: it was the default in the
    # releases that accepted it, and newer scikit-learn releases reject the
    # keyword entirely. `tol` stays explicit so the solver tolerance does not
    # silently follow a changed library default.
    return Ridge(alpha=alpha, fit_intercept=True, copy_X=True,
                 max_iter=None, tol=0.001, solver='auto', random_state=None)

In [91]:
def linearRegression():
    """Build an unfitted ``LinearRegression`` estimator with library defaults."""
    estimator = LinearRegression()
    return estimator

## Evaluation - Mean Squared Error

In [92]:
def meanSquaredError(true,predict):
    """Print the mean squared error between ``predict`` and ``true``.

    The value is written to stdout with four decimal places; nothing is
    returned.
    """
    squared_residuals = (predict - true) ** 2
    mse = np.mean(squared_residuals)
    print("Mean squared error: %.4f" % mse)

In [93]:
def meanSquaredError1(true,predict):
    """Return the mean squared error between ``predict`` and ``true``."""
    residuals = predict - true
    return np.mean(residuals ** 2)

## Cross Validation

In [94]:
def do_cross_validation(X, y,clf, n_folds=5, random_state=None):
    """Estimate the MSE of ``clf`` with shuffled k-fold cross-validation.

    For every fold, a fresh copy of ``clf`` is fitted on the training part
    and scored on the held-out part. The per-fold MSE is printed and the
    mean over all folds is returned.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, indexable by an array of row indices.
    y : numpy.ndarray
        Target vector aligned with the rows of ``X``.
    clf : scikit-learn estimator
        Template estimator. It is cloned for every fold, so the object the
        caller passes in (and any coefficients it already holds) is left
        untouched.
    n_folds : int, optional
        Number of folds.
    random_state : int or None, optional
        Seed for the fold shuffling; pass an integer for reproducible folds.

    Returns
    -------
    float
        Mean of the per-fold mean squared errors.
    """
    # Local import: only this function needs it.
    from sklearn.base import clone

    # NOTE(review): this is the legacy `sklearn.cross_validation.KFold`
    # call signature (n, n_folds, ...), matching the notebook's import. That
    # module no longer exists in current scikit-learn releases; migrating
    # means importing KFold from `sklearn.model_selection` and iterating
    # over `KFold(n_splits=n_folds, ...).split(X)` instead.
    cv = KFold(len(y), n_folds, shuffle=True, random_state=random_state)
    mse = []
    for fold_number, (train_ind, test_ind) in enumerate(cv, 1):
        fold_model = clone(clf)
        fold_model.fit(X[train_ind], y[train_ind])
        predictions = fold_model.predict(X[test_ind])
        print ("Fold %d Evaluation" %fold_number)
        meanSquaredError(y[test_ind],predictions)
        mse.append(meanSquaredError1(y[test_ind],predictions))
        print ("\n")
    return np.mean(mse)

### Test Alpha

In [95]:
def testAlpha(XTrain,YTrain,XTest,YTest,alpha_list):
    """Fit one ridge model per alpha and print its MSE on the test data.

    Parameters
    ----------
    XTrain, YTrain : numpy.ndarray
        Data the ridge model is fitted on.
    XTest, YTest : numpy.ndarray
        Data the fitted model is scored on.
    alpha_list : iterable of float
        Regularization strengths to try; one MSE line is printed per entry,
        in iteration order.
    """
    for alpha in alpha_list:
        ridgeReg = ridgeRegression(alpha = alpha)
        ridgeReg.fit(XTrain,YTrain)
        predictedY = ridgeReg.predict(XTest)
        # meanSquaredError takes (true, predict); the value is symmetric in
        # its arguments, but keep the order consistent with the other calls.
        meanSquaredError(YTest,predictedY)

### Read Data

In [96]:
# Raw string: the original literal mixed escaped (`\\`) and unescaped (`\M`)
# backslashes; `\M` is an invalid escape sequence that Python only tolerates
# with a warning. The resulting path is unchanged.
# NOTE(review): this is an absolute, machine-specific path; adjust it to the
# local copy of the data set before running.
filename = r'C:\Users\ar1\Documents\ML\Project\Crime Prediction Data(1)\Crime Prediction Data\communities-crime-clean.csv'
distintCommunities = findCommunities(filename)
# Default flags: no state / community one-hot columns are appended.
XTrain,YTrain,XTest,YTest,Features = extractData(filename, distintCommunities)

### Default Linear Regression Model

In [97]:
# Fit ordinary least squares on the training split, then report the MSE on
# the held-out test split.
model= linearRegression()
model.fit(XTrain,YTrain)
predictedY=model.predict(XTest)
meanSquaredError(YTest,predictedY)

Mean squared error: 0.0177


### 10 Fold Cross Validation on Default Linear Regression Model

In [98]:
# 10-fold cross-validation on the training split only, using `model` as the
# estimator; the value shown as the cell result is the mean of the fold MSEs.
do_cross_validation(XTrain,YTrain,model,10)

Fold 1 Evaluation
Mean squared error: 0.0197


Fold 2 Evaluation
Mean squared error: 0.0219


Fold 3 Evaluation
Mean squared error: 0.0222


Fold 4 Evaluation
Mean squared error: 0.0195


Fold 5 Evaluation
Mean squared error: 0.0242


Fold 6 Evaluation
Mean squared error: 0.0217


Fold 7 Evaluation
Mean squared error: 0.0276


Fold 8 Evaluation
Mean squared error: 0.0152


Fold 9 Evaluation
Mean squared error: 0.0209


Fold 10 Evaluation
Mean squared error: 0.0180




0.021081737431193504

### Top 10 important features in Linear Regression model

In [99]:
# Sort the coefficients in descending order and print the names of the ten
# features with the largest values.
# NOTE(review): the ranking uses the signed coefficient, not its absolute
# value, so features with large negative weights never appear here — confirm
# this is the intended meaning of "important".
FeatureEval = model.coef_
topFeature = np.argsort(FeatureEval)[::-1][:10]
print (Features[topFeature])

['PersPerOccupHous' 'PctRecImmig8' 'MalePctNevMarr' 'PctPersDenseHous'
 'medFamInc' 'agePct12t21' 'MedRent' 'pctWSocSec' 'PctEmploy'
 'MalePctDivorce']


### MSE on all Data Linear Regression

In [81]:
# Recombine the train and test splits, refit `model` on every row, and score
# it on those same rows. This is a training (resubstitution) error, not an
# estimate of performance on unseen data.
X = np.concatenate((XTrain,XTest),axis=0)
Y = np.concatenate((YTrain,YTest),axis=0)
model.fit(X,Y)
predictedY=model.predict(X)
meanSquaredError(Y,predictedY)

Mean squared error: 0.0165


3-a-ii) What is the MSE on the training set (train on all the data then test on it all)?

Explanation: The MSE on the training set is 0.0165 (see the output above). For this case, the model was trained and evaluated on the entire data set provided.

### Default Ridge Regression Model

In [82]:
# Ridge regression with the helper's default alpha (1.0): fit on the training
# split, then report the MSE on the held-out test split.
# Note that this rebinds `model`, replacing the linear regression estimator.
model= ridgeRegression()
model.fit(XTrain,YTrain)
predictedY=model.predict(XTest)
meanSquaredError(YTest,predictedY)

Mean squared error: 0.0172


### 10 Fold Cross Validation on Default Ridge Regression Model

In [83]:
# 10-fold cross-validation of the ridge estimator on the training split only;
# the value shown as the cell result is the mean of the fold MSEs.
do_cross_validation(XTrain,YTrain,model,10)

Fold 1 Evaluation
Mean squared error: 0.0165


Fold 2 Evaluation
Mean squared error: 0.0202


Fold 3 Evaluation
Mean squared error: 0.0199


Fold 4 Evaluation
Mean squared error: 0.0199


Fold 5 Evaluation
Mean squared error: 0.0199


Fold 6 Evaluation
Mean squared error: 0.0195


Fold 7 Evaluation
Mean squared error: 0.0205


Fold 8 Evaluation
Mean squared error: 0.0233


Fold 9 Evaluation
Mean squared error: 0.0206


Fold 10 Evaluation
Mean squared error: 0.0218




0.020207582075637373

3-b-i.	What is the estimated MSE of the model under 10-fold CV?

The estimated MSE of the ridge regression model under 10-fold CV is the mean of the per-fold MSEs shown above: approximately 0.0202.

### Top 10 important features in Ridge Regression model

In [84]:
# Sort the ridge coefficients in descending order and print the names of the
# ten features with the largest values.
# NOTE(review): the ranking uses the signed coefficient, not its absolute
# value, so features with large negative weights never appear here — confirm
# this is the intended meaning of "important".
coef = model.coef_
topindx = np.argsort(coef)[::-1][:10]
print (Features[topindx])


['PctPersDenseHous' 'NumStreet' 'MalePctNevMarr' 'racepctblack' 'MedRent'
 'PctIlleg' 'pctWSocSec' 'PctEmploy' 'MalePctDivorce' 'PctOccupManu']


### MSE on all Data Ridge Regression

In [57]:
# Recombine the train and test splits, refit the ridge `model` on every row,
# and score it on those same rows. This is a training (resubstitution) error,
# not an estimate of performance on unseen data.
X = np.concatenate((XTrain,XTest),axis=0)
Y = np.concatenate((YTrain,YTest),axis=0)
model.fit(X,Y)
predictedY=model.predict(X)
meanSquaredError(Y,predictedY)

Mean squared error: 0.0168


### Alpha Test

In [58]:
# Sweep ridge regularization strengths from strong to weak. testAlpha prints
# one test-split MSE line per alpha, in the same order as this list.
alphaList = [10,1,0.1,.01,.001]
testAlpha(XTrain,YTrain,XTest,YTest,alphaList)

Mean squared error: 0.0173
Mean squared error: 0.0172
Mean squared error: 0.0175
Mean squared error: 0.0177
Mean squared error: 0.0177


## Polynomial Features

In [61]:
# Degree-2 polynomial expansion. The transformer is fitted on the training
# split and the same fitted instance is reused for the test split.
# The stray second positional argument (`2`) was removed: in `fit_transform`
# it was bound to the unused `y` parameter, and `transform` takes only `X`.
poly = PolynomialFeatures(2)
polyXTrain = poly.fit_transform(XTrain)
polyXTest = poly.transform(XTest)

In [62]:
# Ridge regression (default alpha) on the degree-2 polynomial features: fit
# on the expanded training split, report MSE on the expanded test split.
model= ridgeRegression()
model.fit(polyXTrain,YTrain)
predictedY=model.predict(polyXTest)
meanSquaredError(YTest,predictedY)

Mean squared error: 0.0213


In [64]:
# 10-fold cross-validation of the ridge estimator on the polynomial-expanded
# training split; the cell result is the mean of the fold MSEs.
do_cross_validation(polyXTrain,YTrain,model,10)

Fold 1 Evaluation
Mean squared error: 0.0171


Fold 2 Evaluation
Mean squared error: 0.0281


Fold 3 Evaluation
Mean squared error: 0.0352


Fold 4 Evaluation
Mean squared error: 0.0234


Fold 5 Evaluation
Mean squared error: 0.0249


Fold 6 Evaluation
Mean squared error: 0.0203


Fold 7 Evaluation
Mean squared error: 0.0282


Fold 8 Evaluation
Mean squared error: 0.0198


Fold 9 Evaluation
Mean squared error: 0.0220


Fold 10 Evaluation
Mean squared error: 0.0224




0.024139470981724385

In [85]:
# Recombine the polynomial-expanded train and test splits, fit a fresh ridge
# model on every row, and score it on those same rows. This is a training
# (resubstitution) error, not an estimate of performance on unseen data.
X = np.concatenate((polyXTrain,polyXTest),axis=0)
Y = np.concatenate((YTrain,YTest),axis=0)
model= ridgeRegression()
model.fit(X,Y)
predictedY=model.predict(X)
meanSquaredError(Y,predictedY)

Mean squared error: 0.0080
