In [25]:
'''
Author: Sriram Yarlagadda + George Nakhleh
Date Created: 10/15/2016
Note: Please use Python 2.7
'''
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
os.chdir('C:\Users\syarlag1\Desktop\Loyola-SAH-Seasonality-Analysis')
%matplotlib inline

### Reading Data

In [26]:
os.chdir('./data')
# list of variables we need
varList = ['SCORE','SPRING','SUMMER','WINTER','FALL','FEMALE']
# Only Stata (.dta) files are loaded: any other file sitting in the data
# directory would make read_stata fail. sorted() makes the load order
# deterministic across machines.
dataFiles = sorted(name for name in os.listdir('.')
                   if name.lower().endswith('.dta'))
allDataDict = {}
for dataFile in dataFiles:
    # the two characters right before the 4-character extension are used as
    # the dictionary key (presumably the state abbreviation -- confirm
    # against the actual file names)
    state = dataFile[-6:-4]
    allDataDict[state] = pd.read_stata(dataFile, columns=varList)

In [27]:
# creating a new state identifier column on every per-state frame
for key, stateData in allDataDict.items():
    # a scalar broadcasts to every row and does not depend on the frame's index
    stateData['State'] = str(key)[:2]
# combining into a single dataframe
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# state's own row labels are kept and the combined index contains duplicates,
# which breaks label-based operations (drop, [] lookups) further down.
allStateData = pd.concat(list(allDataDict.values()), ignore_index=True)

In [46]:
# Relabel the rows 0..n-1 in place.
# NOTE(review): this cell's execution count (46) is higher than that of the
# filtering cells below (32, 34), i.e. it was last run after them. Filtering
# leaves gaps in the labels, so re-run this afterwards if contiguous labels
# are required.
allStateData.index = range(len(allStateData.index))

### Data Preprocessing

In [None]:
# dimensions (rows, columns) of the combined frame
allStateData.shape

In [29]:
# column labels of the combined frame
allStateData.columns

Index([u'SCORE', u'SPRING', u'SUMMER', u'WINTER', u'FALL', u'FEMALE',
       u'State'],
      dtype='object')

In [30]:
# per-column count of entries below zero
(allStateData < 0).values.sum(axis=0) # 88 values < 0 (invalid)

array([ 0,  0,  0,  0,  0, 88,  0])

In [31]:
# per-column count of missing entries
allStateData.isnull().values.sum(axis=0) # 226 NaNs

array([  0,   0,   0,   0,   0, 226,   0])

In [32]:
# Keep only the rows whose numeric fields are all valid (>= 0).
# NaN fails the comparison, so rows with missing values are removed here too.
# Only numeric columns are tested: the string 'State' column has no meaningful
# ordering against 0 (the comparison raises a TypeError on Python 3).
numericFeatures = allStateData.select_dtypes(include=[np.number]).columns
validRows = (allStateData[numericFeatures] >= 0).all(axis=1)
allStateData = allStateData[validRows]

In [34]:
# removing instances where all four season indicators are equal to 0
seasonTotal = (allStateData['FALL'] +
               allStateData['WINTER'] +
               allStateData['SUMMER'] +
               allStateData['SPRING'])
# A boolean mask is used instead of drop() by label: drop() removes every row
# that shares a label with a matching row, so it deletes unrelated rows
# whenever the index contains duplicate labels (as it does straight after
# concatenating the per-state frames). Rows whose total is NaN are kept,
# exactly as before.
allStateData = allStateData[seasonTotal != 0]

In [23]:
# dimensions (rows, columns) after the filtering cells above
allStateData.shape

(29182, 7)

In [50]:
# creating a new "season" variable
# Built from boolean masks on the frame itself rather than a positional loop
# with label lookups, so the result is correct even when earlier filtering
# has left gaps (or duplicates) in the row labels. Each row gets exactly one
# value, so a row with no flag or several flags cannot shift the other rows.
seasonNames = ['FALL', 'SPRING', 'SUMMER', 'WINTER']
season = pd.Series(np.nan, index=allStateData.index, dtype=object)
# Assign in reverse order so that, if a row has more than one flag set to 1,
# the first name in seasonNames wins; rows with no flag set stay NaN.
for seasonName in reversed(seasonNames):
    season[(allStateData[seasonName] == 1).values] = seasonName

# .values assigns positionally, side-stepping index alignment altogether
allStateData['SEASON'] = season.values

In [51]:
# saving a CSV copy of the entire data for later use
# NOTE(review): the chdir below is relative to the current working directory,
# so re-running this cell moves up one more level each time.
os.chdir('./..')  # step up to the parent of the current directory
allStateData.to_csv('allData.csv') # note that all data only includes the column subsets we need

In [52]:
# display the derived SEASON column
allStateData['SEASON'] 

0        SUMMER
1        WINTER
2        WINTER
3          FALL
4        SUMMER
5        WINTER
6        SPRING
7        SPRING
8          FALL
9        SUMMER
10       SPRING
11         FALL
12       SUMMER
13       WINTER
14         FALL
15       SUMMER
16       WINTER
17       SPRING
18       SUMMER
19       WINTER
20         FALL
21       WINTER
22       SPRING
23       WINTER
24       SPRING
25       SPRING
26         FALL
27         FALL
28       WINTER
29       SUMMER
          ...  
29152    SUMMER
29153    SUMMER
29154    SUMMER
29155    SUMMER
29156    SUMMER
29157    SUMMER
29158    SUMMER
29159      FALL
29160      FALL
29161      FALL
29162      FALL
29163      FALL
29164      FALL
29165      FALL
29166      FALL
29167      FALL
29168      FALL
29169    WINTER
29170      FALL
29171    WINTER
29172    SUMMER
29173    WINTER
29174    SPRING
29175      FALL
29176    SPRING
29177    SPRING
29178    SPRING
29179    SUMMER
29180    WINTER
29181    WINTER
Name: SEASON, dtype: object

### Seasonality

#### *Question*: Is there seasonality in the SAH Scores?

Let us first perform some quick data exploration