In [174]:
import pandas as pd
import requests
import io
import numpy as np
import random as rd

In [102]:
url_data = 'http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data'
url_desc = 'http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.names'
pd.options.display.max_rows = 6

In [132]:
data = requests.get(url_data).content
columns_name_data = requests.get(url_desc).content
columns_name = [k.split(' ')[1] for k in ((columns.getvalue()).split('\n')) if '@attribute' in k]
df = pd.read_csv(io.StringIO(data.decode('utf-8')), names = columns_name).replace('?',np.nan)

## Part 1

In [133]:
# prep dataset
for c in df:
    df[c] = pd.to_numeric(df[c], errors = 'ignore')
df

Unnamed: 0,state,county,community,communityname,fold,population,householdsize,racepctblack,racePctWhite,racePctAsian,...,LandArea,PopDens,PctUsePubTrans,PolicCars,PolicOperBudg,LemasPctPolicOnPatr,LemasGangUnitDeploy,LemasPctOfficDrugUn,PolicBudgPerPop,ViolentCrimesPerPop
0,8,,,Lakewoodcity,1,0.19,0.33,0.02,0.90,0.12,...,0.12,0.26,0.20,0.06,0.04,0.90,0.5,0.32,0.14,0.20
1,53,,,Tukwilacity,1,0.00,0.16,0.12,0.74,0.45,...,0.02,0.12,0.45,,,,,0.00,,0.67
2,24,,,Aberdeentown,1,0.00,0.42,0.49,0.56,0.17,...,0.01,0.21,0.02,,,,,0.00,,0.43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1991,9,9.0,80070.0,Waterburytown,10,0.16,0.37,0.25,0.69,0.04,...,0.08,0.32,0.18,0.08,0.06,0.78,0.0,0.91,0.28,0.23
1992,25,17.0,72600.0,Walthamcity,10,0.08,0.51,0.06,0.87,0.22,...,0.03,0.38,0.33,0.02,0.02,0.79,0.0,0.22,0.18,0.19
1993,6,,,Ontariocity,10,0.20,0.78,0.14,0.46,0.24,...,0.11,0.30,0.05,0.08,0.04,0.73,0.5,1.00,0.13,0.48


### Replace Nan with mean (1.a)

In [134]:
df.fillna(df.mean())

Unnamed: 0,state,county,community,communityname,fold,population,householdsize,racepctblack,racePctWhite,racePctAsian,...,LandArea,PopDens,PctUsePubTrans,PolicCars,PolicOperBudg,LemasPctPolicOnPatr,LemasGangUnitDeploy,LemasPctOfficDrugUn,PolicBudgPerPop,ViolentCrimesPerPop
0,8,58.826829,46188.336597,Lakewoodcity,1,0.19,0.33,0.02,0.90,0.12,...,0.12,0.26,0.20,0.060000,0.040000,0.900000,0.500000,0.32,0.140000,0.20
1,53,58.826829,46188.336597,Tukwilacity,1,0.00,0.16,0.12,0.74,0.45,...,0.02,0.12,0.45,0.163103,0.076708,0.698589,0.440439,0.00,0.195078,0.67
2,24,58.826829,46188.336597,Aberdeentown,1,0.00,0.42,0.49,0.56,0.17,...,0.01,0.21,0.02,0.163103,0.076708,0.698589,0.440439,0.00,0.195078,0.43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1991,9,9.000000,80070.000000,Waterburytown,10,0.16,0.37,0.25,0.69,0.04,...,0.08,0.32,0.18,0.080000,0.060000,0.780000,0.000000,0.91,0.280000,0.23
1992,25,17.000000,72600.000000,Walthamcity,10,0.08,0.51,0.06,0.87,0.22,...,0.03,0.38,0.33,0.020000,0.020000,0.790000,0.000000,0.22,0.180000,0.19
1993,6,58.826829,46188.336597,Ontariocity,10,0.20,0.78,0.14,0.46,0.24,...,0.11,0.30,0.05,0.080000,0.040000,0.730000,0.500000,1.00,0.130000,0.48


In [135]:
# list ratios of empty values per columns
print(df.isna().sum()/len(df))

state                  0.000000
county                 0.588766
community              0.590271
                         ...   
LemasPctOfficDrugUn    0.000000
PolicBudgPerPop        0.840020
ViolentCrimesPerPop    0.000000
Length: 128, dtype: float64


It would be a terrible idea to replace all Nan cell with the mean of the columns. Here's why:
    1. Some columns (e.g. county) are number representing classes. 
    2. Some nan can acutllay be approximated (e.g. PolicPerPop = PolicReqPerOffic / population).
    3. Some columns have too many Nans (e.g. PolicBudgPerPop = 84% nan) and can be discarted.

### What to do with Nan (1.b)

If the column represent clases...:
    1. and the class ID is truly important for the task and there are not too many of them we are trying to solve, apply one hot encoding.
    2. create a new class (e.g. -1).
If the column values can be calculated or approximated:
    1. replace their value with the calculated/approximated value.
If the column has too many Nans:
    1. Remove the column.
If the column has very little amount of Nans:
    1. Remove the observation.
    

### Better Method (1.c)

In [136]:
# Printing colums with Nans and ratios of nans/data
t = [print(c, df[c].isna().sum()/len(df)) for c in (df.columns[df.isnull().any()])]
print("--------------------------------------")

county 0.588766298897
community 0.590270812437
OtherPerCap 0.000501504513541
LemasSwornFT 0.840020060181
LemasSwFTPerPop 0.840020060181
LemasSwFTFieldOps 0.840020060181
LemasSwFTFieldPerPop 0.840020060181
LemasTotalReq 0.840020060181
LemasTotReqPerPop 0.840020060181
PolicReqPerOffic 0.840020060181
PolicPerPop 0.840020060181
RacialMatchCommPol 0.840020060181
PctPolicWhite 0.840020060181
PctPolicBlack 0.840020060181
PctPolicHisp 0.840020060181
PctPolicAsian 0.840020060181
PctPolicMinor 0.840020060181
OfficAssgnDrugUnits 0.840020060181
NumKindsDrugsSeiz 0.840020060181
PolicAveOTWorked 0.840020060181
PolicCars 0.840020060181
PolicOperBudg 0.840020060181
LemasPctPolicOnPatr 0.840020060181
LemasGangUnitDeploy 0.840020060181
PolicBudgPerPop 0.840020060181
--------------------------------------


Categorical Columns:

        county --> will replace nan with -1 (creating new category)
        community --> will be removed b/c according to the documentation, it is not predictive
__________
Columns with large amount of Nan:

        LemasSwornFT, LemasSwFTPerPop, LemasSwFTFieldOps, LemasSwFTFieldPerPop, LemasTotalReq, LemasTotReqPerPop, PolicReqPerOffic, PolicPerPop, RacialMatchCommPol, PctPolicWhite, PctPolicBlack, PctPolicHisp, PctPolicAsian, PctPolicMinor, OfficAssgnDrugUnits, NumKindsDrugsSeiz, PolicAveOTWorked, PolicCars, PolicOperBudg,LemasPctPolicOnPatr, LemasGangUnitDeploy, PolicBudgPerPop
                      --> will delete the column
            
__________
Others:

        OtherPerCap --> replaced with average

In [155]:
# Remove column with too many Nan
df = df.loc[:, df.isnull().mean() < .75]
# Remove 'community'
df = df.drop(labels = 'community', axis = 1)

In [158]:
# Replace Nan with -1 for 
df.OtherPerCap = df.OtherPerCap.fillna(df.OtherPerCap.mean())
df.county = df.county.fillna(-1)

This is better because:
    1. We only keep features that matter and have information in them.
    2. We keep all the information that 'county' can provide and signal the model that there are data that wasn't labeled.
    3. We add information by replacing 'OtherPerCap' nans with average.

### Save dataset

In [163]:
df.to_csv(path_or_buf='communities_modified.csv')

## Part 2

In [199]:
def make_k_fold_cross_val(dataframe, k):
    for i in range(k):
        val_ids = rd.sample(range(0, len(dataframe)), int(len(dataframe)/k))
        val_df = dataframe.iloc[val_ids]
        train_ids = list(set(range(0, len(dataframe))) - set(val_ids))
        train_df = dataframe.iloc[train_ids]
        df.to_csv(path_or_buf=('CandC_train'+ str(i) + '.csv'))
        df.to_csv(path_or_buf=('CandC_val'+ str(i) + '.csv'))
    return 
    

In [200]:
make_k_fold_cross_val(df, 5)

In [166]:
def train(df_train, df_val, w, learning_rate = 1e-6, print_ = False):
    # train and valication data
    xs, ys = df_train.x.values, df_train.y.values
    xs = np.hstack((np.ones_like(xs), xs)).reshape(2, len(xs)).T  # for matrix multiplication
    x_val, y_val = df_val.x.values, df_val.y.values
    x_val = np.hstack((np.ones_like(x_val), x_val)).reshape(2, len(x_val)).T  # for matrix multiplication

    # rolling window over MSE to stop when stature
    all_mse_val, all_mse_tr = [], []
    epoch = 0
    
    # Training loop, stoping based on rolling_mse
    while True:
        w -= learning_rate * loss(xs, ys, w)
        current_mse_val, current_mse_tr = mse(x_val, y_val, w), mse(xs, ys, w)

        # add MSEs and loss to lists
        all_mse_val.append(current_mse_val)
        all_mse_tr.append(current_mse_tr)        

        # evaluate model every N steps and stop if stature
        if epoch % patience == 0:
            if print_:
                print("Epoch", epoch)
                plot_fcts(xs,ys,w)
            if epoch >= patience:
                print("Epoch #%s: MSE = %s" % (epoch, current_mse_val))
                print(np.mean(all_mse_val[-patience:int(-patience/2)]) - np.mean(all_mse_val[int(-patience/2):]))
                if np.mean(all_mse_val[-patience:int(-patience/2)]) - np.mean(all_mse_val[int(-patience/2):]) < 0.00001:
                    # Not learning anymore, returning all mse
                    return all_mse_tr, all_mse_val, w, epoch
        
        epoch += 1

In [167]:
w_ = np.random.rand(2)*10
print(w_)
all_mse_train_, all_mse_val_, weights_, epoch_ = train(df_train, df_val, w_, 1e-6)

[ 7.58821857  4.10023313]


NameError: name 'df_train' is not defined

In [170]:
df[1,2,3]

KeyError: (1, 2, 3)