#### Imputing & Adding Newly Created Variables to Testing Data

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Show what is available under the input root and the Rossmann dataset folder.
for input_dir in ("../input", "../input/rossmann-store-sales"):
    print(os.listdir(input_dir))

In [None]:
# Column -> dtype overrides handed to every typed read_csv call below.
int_dtype, str_dtype, float_dtype = np.dtype(int), np.dtype(str), np.dtype(float)
types = {
    'CompetitionOpenSinceYear': int_dtype,
    'CompetitionOpenSinceMonth': int_dtype,
    'StateHoliday': str_dtype,
    'Promo2SinceWeek': int_dtype,
    'SchoolHoliday': float_dtype,
    'PromoInterval': str_dtype,
}

# Training frame produced by the exploratory-analysis step.
training = pd.read_csv(
    "../input/exploratory-analysis/training.csv",
    dtype=types,
)

# Raw competition files; the date column is parsed by position.
test = pd.read_csv(
    "../input/rossmann-store-sales/test.csv",
    parse_dates=[3],
    dtype=types,
)
train = pd.read_csv(
    "../input/rossmann-store-sales/train.csv",
    parse_dates=[2],
    dtype=types,
)

# Store metadata is read with pandas' inferred dtypes.
store = pd.read_csv("../input/rossmann-store-sales/store.csv")

#### Filling in Missing Values

In [52]:
import numpy
from sklearn.base import TransformerMixin

# Fill missing numeric values with each column's median.
# numeric_only=True is required here: both frames also hold string columns
# (StateHoliday in test; StoreType, Assortment and PromoInterval in store),
# and pandas >= 2.0 raises TypeError when asked for the median of a
# non-numeric column instead of silently skipping it.
test.fillna(test.median(numeric_only=True), inplace=True)

# Remaining null count per column of the test frame.
print(test.isnull().sum())

# Same median fill for the numeric store columns; the categorical
# PromoInterval column is left untouched by this call.
store.fillna(store.median(numeric_only=True), inplace=True)

# class to impute categorical variables
class SeriesImputer(TransformerMixin):
    """Impute missing values in a single pandas Series.

    ``fit`` learns one fill value from the Series it receives:

    * numeric dtypes are filled with the Series median;
    * every other dtype (object, category, ...) is filled with the most
      frequent value.

    ``transform`` then replaces missing entries with that value.
    """

    def fit(self, X, y=None):
        """Learn the fill value from ``X``.

        Parameters
        ----------
        X : pandas.Series
            Series whose missing values should later be imputed.
        y : ignored
            Accepted only for signature compatibility.

        Returns
        -------
        SeriesImputer
            ``self``, so calls can be chained.
        """
        if pd.api.types.is_numeric_dtype(X.dtype):
            self.fill = X.median()
        else:
            counts = X.value_counts()
            # value_counts() drops missing entries, so an all-missing Series
            # has no candidate; falling back to NaN makes transform a no-op
            # instead of raising IndexError on index[0].
            self.fill = counts.index[0] if len(counts) else numpy.nan
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing entries replaced by the fitted value.

        Raises
        ------
        AttributeError
            If called before ``fit`` (no fill value has been learned yet).
        """
        return X.fillna(self.fill)

# Fit the imputer on PromoInterval and write the filled column back in one
# chained call (fit returns the imputer itself).
a = SeriesImputer()
store['PromoInterval'] = a.fit(store['PromoInterval']).transform(store['PromoInterval'])

# Attach each store's attributes to its test rows via a join on Store.
testing = test.merge(store, on='Store')


Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,1,4,2015-09-17,1.0,1,0,0.0,c,a,1270.0,9.0,2008.0,0,22.0,2012.0,"Jan,Apr,Jul,Oct"
1,857,1,3,2015-09-16,1.0,1,0,0.0,c,a,1270.0,9.0,2008.0,0,22.0,2012.0,"Jan,Apr,Jul,Oct"
2,1713,1,2,2015-09-15,1.0,1,0,0.0,c,a,1270.0,9.0,2008.0,0,22.0,2012.0,"Jan,Apr,Jul,Oct"
3,2569,1,1,2015-09-14,1.0,1,0,0.0,c,a,1270.0,9.0,2008.0,0,22.0,2012.0,"Jan,Apr,Jul,Oct"
4,3425,1,7,2015-09-13,0.0,0,0,0.0,c,a,1270.0,9.0,2008.0,0,22.0,2012.0,"Jan,Apr,Jul,Oct"
5,4281,1,6,2015-09-12,1.0,0,0,0.0,c,a,1270.0,9.0,2008.0,0,22.0,2012.0,"Jan,Apr,Jul,Oct"
6,5137,1,5,2015-09-11,1.0,0,0,0.0,c,a,1270.0,9.0,2008.0,0,22.0,2012.0,"Jan,Apr,Jul,Oct"
7,5993,1,4,2015-09-10,1.0,0,0,0.0,c,a,1270.0,9.0,2008.0,0,22.0,2012.0,"Jan,Apr,Jul,Oct"
8,6849,1,3,2015-09-09,1.0,0,0,0.0,c,a,1270.0,9.0,2008.0,0,22.0,2012.0,"Jan,Apr,Jul,Oct"
9,7705,1,2,2015-09-08,1.0,0,0,0.0,c,a,1270.0,9.0,2008.0,0,22.0,2012.0,"Jan,Apr,Jul,Oct"


#### Adding Newly Created Variables

In [58]:
# dates breakdown
# Date is converted once; both derived columns read from the same datetime column.
testing['Date'] = pd.to_datetime(testing['Date'])
testing['Day_of_Week'] = testing['Date'].dt.day_name()
testing['Month'] = testing['Date'].dt.strftime('%b')

# competition dates
# Assemble the competitor's opening date (first day of the opening month)
# from explicit year/month/day components. Concatenating the columns as
# strings only produced a parseable value by accident of float formatting
# ("2008.0" + "9.0" + "1" -> "2008.09.01"); a two-digit month gives
# "2010.010.01", and integer columns give an ambiguous "200891".
testing['CompOpenDate'] = pd.to_datetime(
    pd.DataFrame({
        'year': testing['CompetitionOpenSinceYear'].astype(int),
        'month': testing['CompetitionOpenSinceMonth'].astype(int),
        'day': 1,
    })
)

def check_Comp(row):
    """Return "1" if the row's Date is on or after its CompOpenDate, else "0"."""
    comp_is_open = row["Date"] >= row["CompOpenDate"]
    return "1" if comp_is_open else "0"
    
def num_days_comp(row):
    """Return the row's Date minus its CompOpenDate."""
    observed, comp_opened = row['Date'], row['CompOpenDate']
    return observed - comp_opened

# Add both competition features in a single assign. The callables run in
# order, so isCompOpen is computed on a frame that already carries
# daysSinceNewComp (whole days between Date and CompOpenDate).
testing = testing.assign(
    daysSinceNewComp=lambda frame: frame.apply(num_days_comp, axis=1).dt.days,
    isCompOpen=lambda frame: frame.apply(check_Comp, axis=1),
)

# Competition Distance
def comp_dist(row):
    """Map a row's CompetitionDistance onto a quartile bin label.

    Bins (upper bounds inclusive):
        "bin1": distance <= 710
        "bin2": 710 < distance <= 2325
        "bin3": 2325 < distance <= 6880
        "bin4": distance > 6880

    The top bin is open-ended so that a distance above the previously
    hard-coded maximum (75860) still gets a label instead of silently
    falling through to ``None``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide a "CompetitionDistance" entry.

    Returns
    -------
    str or None
        The bin label, or ``None`` when the distance is missing.
    """
    distance = row["CompetitionDistance"]
    # A missing distance fails every comparison below; keep it explicitly
    # unlabelled rather than letting it land in the open-ended top bin.
    if pd.isna(distance):
        return None
    if distance <= 710:
        return "bin1"
    if distance <= 2325:
        return "bin2"
    if distance <= 6880:
        return "bin3"
    return "bin4"

# Label every row with its competition-distance bin.
testing = testing.assign(
    compDistanceBin=lambda frame: frame.apply(comp_dist, axis=1),
)



(1017209, 24)

In [59]:
# (rows, columns) of the training frame loaded from exploratory-analysis/training.csv.
training.shape

(1017209, 24)

In [60]:
# (rows, columns) of the merged test frame, including the newly added feature columns.
testing.shape

(41088, 23)

In [61]:
# Write the engineered test frame to testing.csv in the working directory, without the index column.
testing.to_csv("testing.csv", index=False)

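As expected, the column counts match except for one (24 for training vs. 23 for testing). The testing data does not include the number of customers; it also lacks the `Sales` target but carries an extra `Id` column, so the net difference is a single column.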