# Analyze the Data

#### 1 - Load Clean Building Data
#### 2 - Analyze
####   


### Import Modules and Data


In [1]:
import os
import shutil
import rasterio
from tqdm import tqdm_notebook as tqdm
import matplotlib.cm as cm
import random
import pandas as pd
import numpy as np
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')
from mpl_toolkits.mplot3d import axes3d

In [2]:
# Main directories: every path is derived from the project root, which is
# the parent of the folder this notebook runs in.
project_folder = '../'
data_folder = ''.join((project_folder, '1_data/'))
gif_folder = ''.join((data_folder, 'gifgraph/'))

In [None]:
# Start the clock: record the wall-clock time now so the final cell can
# report how long the whole notebook took to run.
start_time = time.time()

### Get Data - CSV Version

In [3]:
# Load the initial group table and the per-building table from the data
# folder. Both files carry an 'Unnamed: 0' column (presumably the index
# written out when the CSVs were saved -- confirm) which is discarded.
bldngs_grps_initial = (
    pd.read_csv(data_folder + "bldng_groups.csv")
    .drop(columns=['Unnamed: 0'])
)
bldngs = (
    pd.read_csv(data_folder + "bldngs.csv")
    .drop(columns=['Unnamed: 0'])
)

In [None]:
# Regroup: snap each building's latitude and longitude to the nearest
# multiple of 0.003, then total the remaining columns per grid cell.
for place in ('lat', 'long'):
    bldngs['group_' + place] = bldngs[place + 'itude'].apply(
        lambda x: round(x / 0.003) * 0.003
    )

# Every row contributes 1, so the per-group sum of 'count' is the number of
# rows that fell into that group.
bldngs['count'] = 1

bldngs_grps = (
    bldngs
    .drop(columns=['latitude', 'longitude'])
    .groupby(['group_long', 'group_lat'])
    .sum()
    .sort_values(by='count')
    .reset_index()
)

In [None]:
# Report the number of buildings, the number of groups after regrouping, and
# how many fewer groups there are than in the initial group table.
# print() is called with a single string argument so the cell produces the
# same output on both Python 2 and Python 3.
print("Total Buildings: {:,}".format(len(bldngs)))
print("Total Groups: {:,}".format(len(bldngs_grps)))
print("Group Reduction from HectoMeter: {:,}".format(
    len(bldngs_grps_initial) - len(bldngs_grps)))

In [None]:
# Columns of interest for the plots and correlation tables below
analyvars = [
    'count',
    'area',
    'light',
    'bldngval_light',
    'pop',
    'bldngval_pop',
]

In [None]:
# Disabled: the condition is hard-coded to False, so nothing below runs.
# When enabled, it adds four derived columns per variable to bldngs_grps:
# min-max scaled, z-scored, squared, and negated natural log.
if False:
    for frame in (bldngs_grps,):
        for var in ('count', 'bldngval_pop', 'bldngval_light'):
            column = frame[var]
            low, high = column.min(), column.max()
            frame[var + '_norm'] = (column - low) / (high - low)
            frame[var + '_norm_z'] = (column - column.mean()) / column.std()
            frame[var + '_sqrd'] = column ** 2
            frame[var + '_log'] = -np.log(column)

In [None]:
# Average every column across groups that share the same 'count' value, then
# keep only the variables of interest.
# The key is given as a column label so pandas uses 'count' purely as the
# grouping key and reset_index() can put it back as a regular column. A
# single statement suffices: retrying identical code inside a bare `except:`
# cannot change the outcome and would also swallow KeyboardInterrupt.
df = bldngs_grps.groupby('count').mean().reset_index()
grouped = df[analyvars]

In [None]:
# 2x2 grid of scatter plots: each variable against 'count' for every group
# in bldngs_grps, sharing the x axis across panels.
sns.set(style='dark')
f, axes = plt.subplots(2, 2, figsize=(8, 5), sharex=True)
plt.subplots_adjust(bottom=0.1, top=0.9, left=0.125,
                    right=0.9, hspace=0.2, wspace=.3)
# One y variable per panel, filled row by row. The bottom-right panel shows
# 'bldngval_pop' so that all four variables appear exactly once.
for y, axis in zip(['light', 'bldngval_light', 'pop', 'bldngval_pop'],
                   [(0, 0), (0, 1), (1, 0), (1, 1)]):
    sns.scatterplot(y=y, x='count', data=bldngs_grps, ax=axes[axis])

In [None]:
# Pairwise correlation of the variables of interest across all groups,
# drawn as an annotated heatmap labelled with the column names.
corr = bldngs_grps[analyvars].corr()
sns.heatmap(
    corr,
    annot=True,
    cmap="YlGnBu",
    xticklabels=corr.columns.values,
    yticklabels=corr.columns.values,
)

In [None]:
# 2x2 grid of scatter plots: each variable against 'count' using the
# per-count averages in `grouped`, sharing the x axis across panels.
f, axes = plt.subplots(nrows=2, ncols=2, figsize=(8, 5), sharex=True)
plt.subplots_adjust(left=0.125, right=0.9, bottom=0.1, top=0.9,
                    wspace=.3, hspace=0.2)
panels = [
    ((0, 0), 'light'),
    ((0, 1), 'bldngval_light'),
    ((1, 0), 'pop'),
    ((1, 1), 'bldngval_pop'),
]
for position, column in panels:
    sns.scatterplot(x='count', y=column, data=grouped, ax=axes[position])

In [None]:
# Same correlation heatmap as for the raw groups, computed on the
# per-count averages in `grouped`.
corr = grouped.corr()
sns.heatmap(
    corr,
    xticklabels=corr.columns.values,
    yticklabels=corr.columns.values,
    cmap="YlGnBu",
    annot=True,
)

In [None]:
# Column totals of the building table, computed two ways:
#   cluster_groups - one row of sums per distinct 'cluster_k5' value
#   total          - a single set of sums over all rows
# The equivalent per-'state' grouping is left commented out.
# state_groups = bldngs.groupby(bldngs['state']).sum()
cluster_groups = bldngs.groupby(bldngs['cluster_k5']).sum()
total = bldngs.sum()

In [None]:
# Per-cluster totals of the selected columns, divided by one million and
# cast to float; the expression is last in the cell so it is displayed.
cols_in_millions = ['count', 'area', 'bldngval_light', 'bldngval_pop']
(cluster_groups[cols_in_millions] / 1000000).astype(float)

In [None]:
# Overall totals of the selected columns, divided by one million and cast to
# float; reset_index() turns the labelled values into a two-column table.
overall = total[['count', 'area', 'bldngval_light', 'bldngval_pop']] / 1000000
overall.astype(float).reset_index()

In [None]:
# Report the notebook's total wall-clock run time as hours, minutes, seconds.
stop_time = time.time()
elapsed_seconds = int(stop_time - start_time)
# divmod keeps each unit within its own range: the minutes figure is what is
# left after removing whole hours, not the total number of minutes elapsed.
hours, leftover = divmod(elapsed_seconds, 3600)
minutes, seconds = divmod(leftover, 60)
# A single-argument print() call behaves the same on Python 2 and Python 3.
print('This notebook took {} Hours {} Minutes and {} Seconds'
      .format(hours, minutes, seconds))