In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_columns', 50)

In [2]:
train = pd.read_csv('training.csv')
test = pd.read_csv('validation.csv')

In [3]:
prop = pd.concat([train, test], axis=0)

In [3]:
undicided = ['zip_code', 'street_name', 'street_number']
f_to_split = ['damage_code', 'misc_features']
f_to_encode = ['zone', 'sub_type', 'city_name', 'area_type', 'inspection_type',
               'structural_quality_grade', 'exterior_condition_grade', 'interior_condition_grade',
               'utilities_grade', 'damage_and_issue_grade',
               'exterior_color', 'exterior_material']
f_num = ['days_on_market', 'current_population', 'population_5_years_ago', 'schools_in_area',
         'public_transit_score', 'crime_score', 'culture_score', 'average_neighborhood_price', 'overall_inspector_score',
         'sqft', 'floors_in_building', 'floors_in_unit', 'floor_of_unit', 'bedrooms', 'bathrooms', 'parking', 'basement',
         'central_hvac']
f_date = ['build_year', 'remodel_year']

In [7]:
prop.build_date = prop.build_date.str.replace('(\d*?)-02-29', '\\1-02-28')
prop.remodel_date = prop.remodel_date.str.replace('(\d*?)-02-29', '\\1-02-28')
prop.build_date = pd.to_datetime(prop.build_date, format='%Y-%m-%d')
prop.remodel_date = pd.to_datetime(prop.remodel_date, format='%Y-%m-%d')
prop['build_year'] = prop.build_date.dt.year
prop['remodel_year'] = prop.build_date.dt.year

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
misc_cv = CountVectorizer()
damage_cv = CountVectorizer()
misc_dummy = misc_cv.fit_transform(prop.misc_features.str.replace('|', ' ').fillna('')).toarray()
damage_dummy = damage_cv.fit_transform(prop.damage_code.str.replace('|', ' ').fillna('')).toarray()

In [11]:
train_X = prop.copy()

In [12]:
for c in f_to_encode:
    train_X[c] = train_X[c].factorize()[0]

In [13]:
X = np.hstack([train_X[f_to_encode + f_num + f_date + ['initial_price']].fillna(-1).values,
               misc_dummy, damage_dummy, misc_dummy.sum(axis=1).reshape(-1, 1), damage_dummy.sum(axis=1).reshape(-1, 1)])
y = train_X.investment.values

In [14]:
final_return = prop.final_price - prop.investment - prop.initial_price

In [15]:
y = (final_return / (prop.initial_price + prop.investment)).values

In [23]:
X_tr, X_te, y_tr = X[:20000], X[20000:], y[:20000]

In [1]:
from lightgbm import LGBMRegressor

In [2]:
lgb = LGBMRegressor(n_estimators=5000, learning_rate=0.01, num_leaves=40, min_child_samples=50)

In [25]:
lgb = LGBMRegressor(n_estimators=5000, learning_rate=0.01, num_leaves=40, min_child_samples=50)
pred = lgb.fit(X_tr, y_tr, categorical_feature=range(len(f_to_encode))).predict(X_te)

In [28]:
test['pred'] = pred

In [35]:
test = test.sort_values('pred', ascending=False)
test['purchase_decision'] = 1
test['investment'] = train['investment'].mean()

In [36]:
test

Unnamed: 0,property_id,zone,sub_type,street_name,street_number,address_line_2,city_name,zip_code,days_on_market,build_date,remodel_date,area_type,current_population,population_5_years_ago,schools_in_area,public_transit_score,crime_score,culture_score,average_neighborhood_price,damage_code,inspection_type,structural_quality_grade,exterior_condition_grade,interior_condition_grade,utilities_grade,damage_and_issue_grade,overall_inspector_score,sqft,floors_in_building,floors_in_unit,floor_of_unit,bedrooms,bathrooms,parking,basement,central_hvac,misc_features,exterior_color,exterior_material,initial_price,initial_value,purchase_decision,investment,final_price,pred
220,2361676,commercial,retail,Hernandez Freeway,295,,Leonardport,59720,132,1971-02-05,2013-10-16,urban,90100,67500,12.0,7.0,3.25,5.47,489000.0,,buyer,C,C,D,C,D,52.0,1470.00,1.0,1.0,,,,1,0.0,1,,beige,concrete_poured,450000.0,,1,44586.0195,,0.923979
4222,4116321,commercial,restaurant,Eric Plain,8413,,South Arthur,94182,166,2013-07-21,,urban,92900,72300,12.0,6.0,3.89,7.48,,,seller,C,B,D,F,C,59.0,1249.00,1.0,1.0,,,,1,0.0,1,,white,wood,423000.0,,1,44586.0195,,0.848604
4006,8082140,commercial,restaurant,Murphy Road,7972,,West Austin,28576,70,1948-03-10,1986-03-13,urban,86900,71700,18.0,4.0,4.49,7.96,442000.0,,buyer,B,E,C,B,C,72.0,1334.00,1.0,2.0,0.0,,,0,0.0,0,,brown,concrete_poured,460000.0,,1,44586.0195,,0.843595
2476,3926440,commercial,office space,Brittany Drive,4274,,Leonardport,59720,127,2008-07-14,2015-11-20,urban,90100,67500,13.0,7.0,2.19,6.76,,,seller,D,D,E,B,B,31.0,3433.00,1.0,1.0,,,,1,0.0,1,,red,stone,545000.0,,1,44586.0195,,0.821108
1599,7804915,mixed-use,condo,Estrada Springs,9378,,Leonardport,59720,151,1953-12-22,1987-10-13,urban,90100,67500,10.0,6.0,2.12,8.65,,,seller,F,C,D,B,C,46.0,874.00,31.0,1.0,16.0,2.0,2.0,0,0.0,0,,brown,brick,547000.0,,1,44586.0195,,0.791515
3840,7748019,residential,condo,Romero Port,413,,West Williamburgh,27724,76,1998-03-24,2009-11-20,urban,77700,60300,8.0,9.0,3.69,5.63,210000.0,,buyer,C,D,C,E,D,43.0,1491.00,31.0,1.0,27.0,2.0,4.5,0,0.0,1,,grey,concrete_poured,240000.0,,1,44586.0195,,0.775616
3913,5008296,residential,condo,Cory Lakes,686,,West Austin,28576,35,2012-05-06,,urban,86900,71700,9.0,5.0,4.44,5.97,222000.0,flood,buyer,C,C,C,D,E,44.0,999.00,39.0,1.0,11.0,1.0,2.5,0,0.0,1,alarm|fireplace,beige,concrete_poured,187000.0,,1,44586.0195,,0.766422
2136,8973181,mixed-use,condo,Morrison Roads,5138,,Leonardport,59720,255,1994-04-21,,urban,90100,67500,13.0,7.0,3.99,8.86,431000.0,,foreclosure,B,A,C,E,C,71.0,1432.00,19.0,1.0,18.0,2.0,2.0,1,0.0,1,,grey,other,521000.0,,1,44586.0195,,0.756429
3354,9364413,residential,condo,Karen Crest,111,,North April,45566,47,2012-06-27,,urban,80700,63500,10.0,9.0,3.86,4.50,222000.0,,buyer,E,C,A,B,D,55.0,1423.00,26.0,1.0,13.0,3.0,3.5,0,0.0,0,,yellow,steel,235000.0,,1,44586.0195,,0.735683
4032,4679120,residential,condo,Cross Junction,7922,,South Arthur,94182,38,1938-10-20,1998-12-22,urban,92900,72300,8.0,6.0,4.81,5.94,235000.0,,seller,D,B,B,F,E,74.0,1441.00,21.0,1.0,14.0,3.0,2.5,0,0.0,0,,brown,wood,286000.0,,1,44586.0195,,0.730712


In [40]:
test[['property_id', 'purchase_decision', 'investment']].to_csv('team8.csv', index=False)

In [41]:
lgb.feature_importances_

array([ 4139,  3803, 10111,   654,   984,  3888,  3115,  2697,  3112,
        2926,  4434,  4144, 12635, 13222, 13447,  6051,  3626, 12924,
       13535,  4812,  9305, 12900,  3464,   540,  2342,  1508,  3827,
        1382,   132,  1320, 13269,     0, 16412,   150,     7,    53,
          15,    95,    46,     0,   157,   143,   207,   378,   426,
         309,   171,   123,    32,   162,    42,   159,   206,   385,  1074])