In [None]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

In [None]:
from dataset_features import numeric_features, date_features

In [None]:
# Paths to the numeric and date training CSV files.
# NOTE(review): neither name is referenced again in the visible cells — confirm they are still needed.
numeric_fname = "input/train_numeric.csv"
date_fname = "input/train_date.csv"

In [None]:
# Load the date features of one station for interactive inspection.
station_id = 'L0S01'  # alternatives: 'L0S00', 'L0S09'
features = ['Id'] + date_features[station_id]
fname = "data/{0}_date.csv".format(station_id)

station_df = pd.read_csv(fname, index_col=['Id'], usecols=features)

In [None]:
# Preview the first rows of the selected station's frame.
station_df.head()

In [None]:
# Per-column mean of the station's date features.
station_df.mean()

In [None]:
# Column dtypes, non-null counts and memory usage.
station_df.info()

In [None]:
# True if the L0_S1_D26 column has at least one missing value.
station_df['L0_S1_D26'].isnull().any()

In [None]:
def date_df_missing_values_summary():
    """Print, for every station, the date features that contain missing values.

    For each station id in ``date_features`` (in sorted order) the file
    ``data/<station_id>_date.csv`` is loaded with ``Id`` as index, and one
    line is written to stdout: the station id followed by the names of the
    columns that hold at least one null, separated by single spaces.

    Returns:
        None. The summary is only written to stdout.
    """
    for station_id in sorted(date_features):
        # Written before the read so progress is visible while the file
        # loads. sys.stdout.write behaves the same on Python 2 and Python 3,
        # unlike the print statement.
        sys.stdout.write(station_id)
        fname = "data/" + station_id + '_date.csv'
        features = ['Id'] + date_features[station_id]
        station_df = pd.read_csv(fname, usecols=features, index_col=['Id'])

        # One vectorised null scan instead of a separate pass per column.
        has_nulls = station_df.isnull().any()
        for feature in station_df.columns[has_nulls.values]:
            sys.stdout.write(' ' + feature)
        sys.stdout.write('\n')


In [None]:
%%time
# Scan every station file and list the date columns that contain nulls.
date_df_missing_values_summary()

In [None]:
# Show the station frame before its missing values are filled in the next section.
station_df

# Update Missing Values for Date Features

In [None]:
# Replace each missing value with the mean of the non-missing date features
# on the same row. The means are taken once, before any column is filled.
# A row with no values at all has a NaN mean and therefore stays NaN.
row_means = station_df.mean(axis=1)
for col in station_df.columns:
    station_df[col] = station_df[col].fillna(row_means)


In [None]:
# Rows where L0_S1_D26 is still null after the row-mean fill.
station_df[station_df['L0_S1_D26'].isnull()]

In [None]:
def date_df_missing_values_update():
    """Fill missing date values per station and save the repaired files.

    For every station id in ``date_features`` (in sorted order) this loads
    ``data/<station_id>_date.csv`` and replaces each missing value with the
    mean of the non-missing date features on the same row. If at least one
    column needed filling, the frame is written to
    ``data/<station_id>_date_fixed.csv``; stations without missing values get
    no ``_fixed`` file. Rows with no values at all have a NaN row mean and
    remain NaN.

    Progress goes to stdout, one line per station: the station id, a ``.``
    for every column that was filled, and ``Updated`` when a file was written.

    Returns:
        None.
    """
    for station_id in sorted(date_features):
        # sys.stdout.write behaves the same on Python 2 and Python 3, unlike
        # the print statement, and keeps the space-separated line format.
        sys.stdout.write(station_id)
        fname = "data/" + station_id + '_date.csv'
        features = ['Id'] + date_features[station_id]
        station_df = pd.read_csv(fname, usecols=features, index_col=['Id'])

        save_required = False
        # Computed once, before any fill, so every column is filled from the
        # same original row means.
        row_means = station_df.mean(axis=1)
        for feature in station_df.columns:
            if station_df[feature].isnull().any():
                save_required = True
                station_df[feature] = station_df[feature].fillna(row_means)
                sys.stdout.write(' .')
        if save_required:
            fname = "data/" + station_id + '_date_fixed.csv'
            station_df.to_csv(fname)
            sys.stdout.write(' Updated')
        sys.stdout.write('\n')


In [None]:
%%time
# Fill missing values for every station and write the *_date_fixed.csv files where needed.
date_df_missing_values_update()

In [None]:
import os
def date_file_name(station_id):
    """Return the path of the date CSV to use for ``station_id``.

    The repaired ``data/<station_id>_date_fixed.csv`` is preferred when it
    exists on disk; otherwise the path of the original
    ``data/<station_id>_date.csv`` is returned (without checking that it
    exists).
    """
    fixed_path = "data/" + station_id + '_date_fixed.csv'
    if os.path.exists(fixed_path):
        return fixed_path
    return "data/" + station_id + '_date.csv'

# Timeline

In [None]:
# Empty (all-NaN) float32 frame: one column per station, one row per Id.
# NOTE(review): 2367495 is presumably the largest Id in the data — confirm.
column_names = sorted(date_features)
timeline_df = pd.DataFrame(
    dtype=np.float32,
    columns=column_names,
    index=range(1, 2367495 + 1),
)
timeline_df.index.name = 'Id'
timeline_df.shape

In [None]:
# Show the timeline frame before any station data is copied in.
timeline_df

In [None]:
# Copy one timestamp column per station into the shared timeline frame.
for station_id in sorted(date_features):
    # Space-separated progress marker. sys.stdout.write behaves the same on
    # Python 2 and Python 3, unlike the print statement.
    sys.stdout.write(station_id + ' ')
    fname = date_file_name(station_id)
    station_df = pd.read_csv(fname, index_col=['Id'])

    # The last column of the station file supplies the station's timestamp.
    last_feature_in_list = station_df.columns[-1]

    # Only the Ids present in this station's file are written; the
    # assignment aligns on the Id index.
    timeline_df.loc[station_df.index, station_id] = station_df[last_feature_in_list]


In [None]:
# Drop the rows in which every station column is NaN.
timeline_df = timeline_df.dropna(how='all')


In [None]:
# Earliest and latest station timestamp per Id, and the span between them.
# The aggregates cover the station columns only. Otherwise re-running this
# cell would fold the previously derived Maximum/Total_Duration columns into
# the row minimum and corrupt it.
derived_columns = ['Minimum', 'Maximum', 'Total_Duration']
station_columns = [c for c in timeline_df.columns if c not in derived_columns]
timeline_df['Minimum'] = timeline_df[station_columns].min(axis=1)
timeline_df['Maximum'] = timeline_df[station_columns].max(axis=1)
timeline_df['Total_Duration'] = timeline_df['Maximum'] - timeline_df['Minimum']

In [None]:
# Inspect the derived per-Id timeline columns.
timeline_df[['Minimum', 'Maximum', 'Total_Duration']]

In [None]:
# Persist the timeline, including its Id index, to CSV.
timeline_df.to_csv("data/timeline.csv", index=True)

## Sorting

In [None]:
# Sort in place by the 'Minimum' column using mergesort, with NaN values last.
timeline_df.sort_values(['Minimum'], inplace=True, kind='mergesort', na_position='last')


In [None]:
# Display the timeline, now ordered by 'Minimum'.
timeline_df

## Timeline Visualization

In [None]:
# Histogram of the 'Minimum' column in 250 bins. The tuple returned by hist
# is kept in histogram_bins for the bin table built further down.
fig, ax = plt.subplots(figsize=(10, 8))
histogram_bins = ax.hist(timeline_df['Minimum'], bins=250)
plt.show()

In [None]:
# Plot the same 'Minimum' distribution with seaborn, again using 250 bins.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases — confirm the installed version before upgrading.
plt.figure(figsize=(10,8))
sns.distplot(timeline_df['Minimum'], bins=250)

In [None]:
# Tabulate the histogram computed earlier: one row per bin with its left edge
# ('A') and its count ('B'), then list the bins holding fewer than 10 items.
yy = histogram_bins[0]  # counts per bin
xx = histogram_bins[1]  # bin edges (one more entry than there are counts)
# Single-argument print(...) calls produce the same output on Python 2 and 3.
print(type(yy))
print("{0} {1}".format(len(yy), len(xx[:-1])))

# list(...) materialises the pairs on Python 3, where zip is lazy.
data = list(zip(xx[:-1], yy))
data_df = pd.DataFrame(data=data, columns=['A', 'B'])
print(data_df[data_df['B'] < 10])

# index = data_df['A'] < 845
# data_df[index].plot(kind='bar')

# index = 844 < data_df['A']
# data_df[index].plot(kind='bar')
