In [11]:
from __future__ import division, absolute_import, print_function

import numpy as np
import scipy as sp
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV #model_selection only works in python3
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV

from sklearn import svm

from sklearn.preprocessing import Imputer,OneHotEncoder,StandardScaler, MaxAbsScaler
from sklearn.feature_selection import SelectFromModel

import pandas as pd

from sklearn.pipeline import make_pipeline

from sklearn.metrics import r2_score



In [25]:
url = 'https://ndownloader.figshare.com/files/7586326'

raw_data = pd.read_csv(url)


In [26]:
raw_data.head()

Unnamed: 0,recid,boro,uf1_1,uf1_2,uf1_3,uf1_4,uf1_5,uf1_6,uf1_7,uf1_8,...,hflag4,hflag18,uf52h_h,uf52h_a,uf52h_b,uf52h_c,uf52h_d,uf52h_e,uf52h_f,uf52h_g
0,1,1,9,9,9,9,1,9,9,9,...,0,0,0,0,0,0,0,0,0,0
1,1,1,9,9,9,9,1,9,9,9,...,0,0,1,1,1,0,0,0,0,0
2,1,1,9,9,9,9,1,9,9,9,...,0,0,1,1,1,0,1,0,1,0
3,1,1,9,9,9,9,1,9,9,9,...,0,0,0,0,0,0,0,0,0,0
4,1,1,9,9,9,9,1,9,9,9,...,0,0,1,0,0,0,1,0,0,0


In [34]:
#delete 9999 which is missing value (can't train if there is no response variable)

raw_data = raw_data[raw_data.uf17 <= 9999]



In [35]:
np.shape(raw_data)

(10229, 197)

In [180]:
#pd.value_counts(data['boro'].values, sort=False)

print(raw_data.cd.value_counts())


1     1559
3     1427
2     1387
7     1254
5     1253
6     1220
10    1143
8     1105
4     1022
9      868
12     562
11     539
13     504
14     493
18     320
15     277
17     238
16     171
Name: cd, dtype: int64


In [19]:
##FEATURE SELECTION

# Categorical

cat_features = ['boro','cd']


# 8 is nan
cat_featured_added = ['uf1_1','uf1_2','uf1_3','uf1_4','uf1_5','uf1_6','uf1_7','uf1_8','uf1_9','uf1_10']
cat_featured_test_10 = ['uf1_11','uf1_12','uf1_13','uf1_14','uf1_15','uf1_16','uf1_35','uf1_17','uf1_18']
cat_featured_test_11 =['uf1_19','uf1_20','uf1_21','uf1_22','sc23','sc24','sc36','sc37','sc38']
cat_featured_test_12 = ['sc147','sc171','sc154','sc157','sc197','sc198','sc188','sc190','sc191','sc192','sc193','sc194','sc575'] 

#cat_simple = ['uf1_1','uf1_7','uf1_7','uf1_17']
#cat_simple1 = ['sc23','sc24','sc36','sc37','sc38']

#4 is nan
cat_featured_added_1 = ['sc114']

#no nan values
cat_featured_added_2 = ['sc149','sc152','sc153','sc155','sc156','sc158']

total_cat = cat_features + cat_featured_added_1 + cat_featured_added + cat_featured_test_10 + cat_featured_test_11 + cat_featured_test_12 

#total_cat_simple = cat_simple + cat_simple1 + cat_featured_test_12 + cat_featured_added_1 + cat_featured_added_2

#Continous

cont_features = ['sc150','sc151']

#98,99 nan, 13k of those
#monthly fees, real estate taxes (13.7K nan), stories in building
cont_added = ['uf11','uf23']

#9999 is nan (mostly monthly fees)
#sc186 9 is no breakdowns, change to 0

cont_added_1 = ['uf12','uf13','uf14','uf15','sc186']



# 8 is not reported
cont_added_2 = ['sc189','sc196','sc199']


#differences
#sc143,sc144,sc154

total_cont = cont_features + cont_added + cont_added_1 + cont_added_2 


## REPLACE INTEGERS WITH NaN's

X = raw_data[total_cat + total_cont]

#cont
X[cont_added] = X[cont_added].replace([99,98],np.nan)
X[cont_added_1] = X[cont_added_1].replace([9999,99999],np.nan)
X[cont_added_2 ] = X[cont_added_2 ].replace(8,np.nan)


#cat
X[cat_featured_added] = X[cat_featured_added].replace(8,np.nan)
X[cat_featured_test_10] = X[cat_featured_test_10].replace(8,np.nan)
X[cat_featured_test_11] = X[cat_featured_test_11].replace(8,np.nan)
X[cat_featured_test_12] = X[cat_featured_test_12].replace(8,np.nan)
X[cat_featured_added_1] = X[cat_featured_added_1].replace(8,np.nan)


## ONE HOT ENCODING (imputation needs to happen before this)

#turn categorical data into dummies
for i in total_cat:
    X[i] = X[i].astype('category')
  



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [611]:
X.sc147.value_counts()

2.0    7728
1.0    1030
3.0     521
Name: sc147, dtype: int64

In [562]:
raw_data.uf8.value_counts()


9999    10229
Name: uf8, dtype: int64

In [554]:
raw_data.dtypes


recid        int64
boro         int64
uf1_1        int64
uf1_2        int64
uf1_3        int64
uf1_4        int64
uf1_5        int64
uf1_6        int64
uf1_7        int64
uf1_8        int64
uf1_9        int64
uf1_10       int64
uf1_11       int64
uf1_12       int64
uf1_13       int64
uf1_14       int64
uf1_15       int64
uf1_16       int64
uf1_35       int64
uf1_17       int64
uf1_18       int64
uf1_19       int64
uf1_20       int64
uf1_21       int64
uf1_22       int64
sc23         int64
sc24         int64
sc36         int64
sc37         int64
sc38         int64
            ...   
fw         float64
chufw        int64
seqno        int64
flg_sx1      int64
flg_ag1      int64
flg_hs1      int64
flg_rc1      int64
hflag2       int64
hflag1       int64
hflag13      int64
hflag6       int64
hflag3       int64
hflag14      int64
hflag16      int64
hflag7       int64
hflag9       int64
hflag10      int64
hflag91      int64
hflag11      int64
hflag12      int64
hflag4       int64
hflag18     

In [20]:



#IMPUTATION (REPLACING NAN'S)

# imp = Imputer(missing_values='NaN',strategy='most_frequent',axis=0)
# imp.fit(X_train)

# # #LASSO MODEL

# lasso = Lasso(alpha=0.001).fit(X_train,y_train)
# print(lasso.score(X_train,y_train))
# print (lasso.score(X_test,y_test))
# print(np.sum(lasso.coef_ != 0))

#print (X_dummies.head())

#scores = cross_val_score(lasso,X_dummies,y,cv=5)
#print(scores)

# #TRAIN TEST SPLIT

#train test split
y=raw_data['uf17']

X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=1)

X_train = pd.get_dummies(X_train) 
X_test = pd.get_dummies(X_test) 

#feature selection

select_lassocv = SelectFromModel(LassoCV(),threshold = 'median')

#pipeline

pipe = make_pipeline(Imputer(missing_values='NaN',strategy='most_frequent'),MaxAbsScaler(),select_lassocv,Lasso(alpha=.001))
scores = cross_val_score(pipe,X_train,y_train,cv=5)
#print(scores)
print(np.mean(scores))


model = pipe.fit(X_train,y_train)
predicted_label = model.predict(X_test)

print(predicted_label)

r2_score(y_train, predicted_label)



#gridsearchCV

#param_grid = {'lasso__alpha' : [0.001,0.01,.1,1,10,100]}
#grid = GridSearchCV(pipe,param_grid=param_grid,cv=5)






0.761619840507
[  67640.35264672  134171.50067279   93475.11657625 ...,    6551.79644088
   16283.14390789   12057.57472521]


ValueError: Found input variables with inconsistent numbers of samples: [11506, 3836]

In [581]:
np.shape(X_test)

(2558, 131)

In [536]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=1)
imp = Imputer(missing_values='NaN',strategy='most_frequent',axis=0)

#print(imp.fit_transform(X_train))

X_train['uf17']





KeyError: 'uf17'

In [457]:
#[ 0.749631    0.76724338  0.76756874  0.7707777   0.7449333 ]

[ 0.74374985  0.76316138  0.76522073  0.76944962  0.74040628]
LassoCV.get_params().keys()



TypeError: get_params() missing 1 required positional argument: 'self'

In [368]:
X = raw_data[['uf1_19','uf1_20','uf1_21','uf1_22','sc23','sc24','sc36','sc37','sc38']]

#X.dtypes

for i in ['uf1_19','uf1_20','uf1_21','uf1_22','sc23','sc24','sc36','sc37','sc38']:
    #print(i,np.shape(X[i]))
    X[i] = X[i].astype('category')
    
X.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


uf1_19    category
uf1_20    category
uf1_21    category
uf1_22    category
sc23      category
sc24      category
sc36      category
sc37      category
sc38      category
dtype: object

In [394]:
#X[cont_added]
#X[cont_added].replace([99,98],np.nan)
data.uf1_1.value_counts()

9    10049
1      104
8       76
Name: uf1_7, dtype: int64

In [203]:
#simple linear model

# lr = LinearRegression().fit(X_train,y_train)

# #print (lr.coef_)
# #print (lr.intercept_)

# print(lr.score(X_train,y_train))
# print(lr.score(X_test,y_test))


#lasso regression

lasso = Lasso(alpha=.01).fit(X_train,y_train)

#print (lr.coef_)
#print (lr.intercept_)

print(lasso.score(X_train,y_train))
print(lasso.score(X_test,y_test))
print(np.sum(lasso.coef_ != 0))

mean  

0.237198026496
0.22222707103
26




In [207]:

print(raw_data.uf23.value_counts())


8     3262
7     3064
5     2180
6     2033
9     2017
1      990
10     595
3      544
4      345
2      312
Name: uf23, dtype: int64


In [216]:
print(raw_data.uf51_.value_counts())


AttributeError: 'DataFrame' object has no attribute 'uf51_'

In [211]:
raw_data.head()

Unnamed: 0,recid,boro,uf1_1,uf1_2,uf1_3,uf1_4,uf1_5,uf1_6,uf1_7,uf1_8,...,hflag4,hflag18,uf52h_h,uf52h_a,uf52h_b,uf52h_c,uf52h_d,uf52h_e,uf52h_f,uf52h_g
0,1,1,9,9,9,9,1,9,9,9,...,0,0,0,0,0,0,0,0,0,0
1,1,1,9,9,9,9,1,9,9,9,...,0,0,1,1,1,0,0,0,0,0
2,1,1,9,9,9,9,1,9,9,9,...,0,0,1,1,1,0,1,0,1,0
3,1,1,9,9,9,9,1,9,9,9,...,0,0,0,0,0,0,0,0,0,0
4,1,1,9,9,9,9,1,9,9,9,...,0,0,1,0,0,0,1,0,0,0


In [241]:
#print(raw_data.uf1_1.value_counts())
test1 = raw_data['uf1_1']

test1.head()

test1.replace(9,4)

columns_list = raw_data.columns
for i in columns_list:
    print(i)

recid
boro
uf1_1
uf1_2
uf1_3
uf1_4
uf1_5
uf1_6
uf1_7
uf1_8
uf1_9
uf1_10
uf1_11
uf1_12
uf1_13
uf1_14
uf1_15
uf1_16
uf1_35
uf1_17
uf1_18
uf1_19
uf1_20
uf1_21
uf1_22
sc23
sc24
sc36
sc37
sc38
hhr2
uf43
hhr5
race1
uf2a
uf2b
sc51
sc52
sc53
sc54
sc110
sc111
sc112
sc113
sc114
sc115
sc116
sc117
sc118
sc120
sc121
uf5
sc125
uf6
sc127
uf7
sc134
uf7a
uf9
sc140
sc141
uf8
sc143
sc144
uf10
uf48
sc147
uf11
sc149
sc173
sc171
sc150
sc151
sc152
sc153
sc154
sc155
sc156
sc157
sc158
sc159
uf12
sc161
uf13
uf14
sc164
uf15
sc166
uf16
sc174
uf64
uf17
sc181
sc541
sc184
sc542
sc543
sc544
uf17a
sc185
sc186
sc197
sc198
sc187
sc188
sc571
sc189
sc190
sc191
sc192
sc193
sc194
sc196
sc548
sc549
sc550
sc551
sc199
sc575
sc570
sc574
sc560
uf53
uf54
uf19
new_csr
rec15
sc26
uf23
rec21
sc27
rec1
uf46
rec4
rec_race_a
rec_race_c
rec62
rec64
rec54
rec53
tot_per
rec28
uf26
uf28
uf27
rec39
uf42
uf42a
uf34
uf34a
uf35
uf35a
uf36
uf36a
uf37
uf37a
uf38
uf38a
uf39
uf39a
uf40
uf40a
cd
uf30
uf29
rec8
rec7
fw
chufw
seqno
flg_sx1
flg_ag1
fl

1    5043
2    4555
3     434
8     197
Name: sc575, dtype: int64

Unnamed: 0,uf17,boro
3,1500,1
5,850,1
10,600,1
11,750,1
14,998,1
16,1650,1
17,975,1
18,750,1
19,1000,1
20,975,1


In [22]:
def score_rent():
	url = 'https://ndownloader.figshare.com/files/7586326'
	raw_data = pd.read_csv(url)

	#delete 9999 which is missing value (can't train if there is no response variable)
	raw_data = raw_data[raw_data.uf17 <= 99999]

	##FEATURE SELECTION

	# Categorical features

	#none are NaN
	cat_features = ['boro','cd']

	# 8 is nan
	cat_featured_added = ['uf1_1','uf1_2','uf1_3','uf1_4','uf1_5','uf1_6','uf1_7','uf1_8','uf1_9','uf1_10']
	cat_featured_test_10 = ['uf1_11','uf1_12','uf1_13','uf1_14','uf1_15','uf1_16','uf1_35','uf1_17','uf1_18']
	cat_featured_test_11 =['uf1_19','uf1_20','uf1_21','uf1_22','sc23','sc24','sc36','sc37','sc38']
	cat_featured_test_12 = ['sc147','sc171','sc154','sc157','sc197','sc198','sc188','sc190','sc191','sc192','sc193','sc194','sc575'] 

	#cat_simple = ['uf1_1','uf1_7','uf1_7','uf1_17']
	#cat_simple1 = ['sc23','sc24','sc36','sc37','sc38']

	#4 is nan
	cat_featured_added_1 = ['sc114']

	#no nan values
	cat_featured_added_2 = ['sc149','sc152','sc153','sc155','sc156','sc158']

	total_cat = cat_features + cat_featured_added_1 + cat_featured_added + cat_featured_test_10 + cat_featured_test_11 + cat_featured_test_12 

	#total_cat_simple = cat_simple + cat_simple1 + cat_featured_test_12 + cat_featured_added_1 + cat_featured_added_2

	#Continous

	cont_features = ['sc150','sc151']

	#98,99 nan, 13k of those
	#monthly fees, real estate taxes (13.7K nan), stories in building
	cont_added = ['uf11','uf23']

	#9999 is nan (mostly monthly fees)
	#sc186 9 is no breakdowns, change to 0

	cont_added_1 = ['uf12','uf13','uf14','uf15','sc186']



	# 8 is not reported
	cont_added_2 = ['sc189','sc196','sc199']


	#differences
	#sc143,sc144,sc154

	total_cont = cont_features + cont_added + cont_added_1 + cont_added_2 


	## REPLACE INTEGERS WITH NaN's

	X = raw_data[total_cat + total_cont]

	#cont
	X[cont_added] = X[cont_added].replace([99,98],np.nan)
	X[cont_added_1] = X[cont_added_1].replace([9999,99999],np.nan)
	X[cont_added_2 ] = X[cont_added_2 ].replace(8,np.nan)


	#cat
	X[cat_featured_added] = X[cat_featured_added].replace(8,np.nan)
	X[cat_featured_test_10] = X[cat_featured_test_10].replace(8,np.nan)
	X[cat_featured_test_11] = X[cat_featured_test_11].replace(8,np.nan)
	X[cat_featured_test_12] = X[cat_featured_test_12].replace(8,np.nan)
	X[cat_featured_added_1] = X[cat_featured_added_1].replace(8,np.nan)


	## ONE HOT ENCODING warmup (imputation needs to happen before this)

	#turn categorical data into dummies
	for i in total_cat:
		X[i] = X[i].astype('category')

	#IMPUTATION (REPLACING NAN'S)

	# imp = Imputer(missing_values='NaN',strategy='most_frequent',axis=0)
	# imp.fit(X_train)

	# # #LASSO MODEL

	# lasso = Lasso(alpha=0.001).fit(X_train,y_train)
	# print(lasso.score(X_train,y_train))
	# print (lasso.score(X_test,y_test))
	# print(np.sum(lasso.coef_ != 0))

	#print (X_dummies.head())

	#scores = cross_val_score(lasso,X_dummies,y,cv=5)
	#print(scores)

	# #TRAIN TEST SPLIT

	#train test split
	y=raw_data['uf17']

	X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=1)

	X_train = pd.get_dummies(X_train) 
	X_test = pd.get_dummies(X_test) 

	#feature selection

	select_lassocv = SelectFromModel(LassoCV(),threshold = 'median')

	#pipeline

	pipe = make_pipeline(Imputer(missing_values='NaN',strategy='most_frequent'),MaxAbsScaler(),select_lassocv,Lasso(alpha=.001))
	scores = cross_val_score(pipe,X_train,y_train,cv=5)
	
	print(scores)
	print(np.mean(scores))


	model = pipe.fit(X_train,y_train)
	predicted_label = model.predict(X_test)

	print(predicted_label)
    
	print (r2_score(y_test, predicted_label))

	#gridsearchCV
	print('gridsearch')
	param_grid = {'lasso__alpha' : np.array([0.001,0.01,.1,1,10,100])}
	grid = GridSearchCV(pipe,param_grid,cv=5)
	grid.fit(X_train,y_train)
	grid.predict(X_test)
    print(grid.scores)
    print(np.mean(grid.scores))
	print(grid.best_params_)
        

score_rent()










IndentationError: unindent does not match any outer indentation level (<ipython-input-22-6aee37330bca>, line 136)

In [24]:
print ("test{},test{}".format(1,2,3))

test1,test2
