In [1]:
import pandas as pd
import calendar 
import datetime 
import calendar 

In [2]:
data_input_file = 'Data Challenge/California_SO2_Measures.csv'
data_output_folder = 'data/'
CONT_VAR = ['Daily Max 1-hour SO2 Concentration' ,'DAILY_AQI_VALUE' ,'DAILY_OBS_COUNT' ,'PERCENT_COMPLETE']

In [3]:
class DataPreProcessing:
    '''
    No Nan Values found : For Nan values fill in missing using average of some distriibution
    '''
    def __init__(self):
        self.df = pd.read_csv(data_input_file)
        self.df = self.drop_no_info_variables()
        self.index_list = self.index_list()
        self.all_df = self.create_independent_dataframes()
        self.df = self.create_df()
        self.df = self.changeDateToDay()
        self.df = self.create_categorical_variables()
        self.mean_normalize_cont_var()
        
        
    def findDay(self, date): 
        '''
        Converts Date to Day : Day conveys information about pollution level (Weekdays pollution should be high)
        '''
        born = datetime.datetime.strptime(date, '%m %d %Y').weekday() 
        return (calendar.day_name[born]) 
    
    def changeDateToDay(self):
        counter = 0
        for i in self.df['Date']:
            self.df['Date'][counter] = self.findDay(' '.join(i.split('/')))
            counter = counter + 1
        return self.df
    
    
    def printdf(self):
        print(self.df.describe())
        print(self.df)
    
    def drop_no_info_variables(self):
        '''
        drop variables that will not be used for computations
        '''
        self.df.drop(self.df.iloc[:, [1,2,3,5,7,10,11,12,13]], inplace = True, axis = 1) 
        return self.df
    
    
    def create_independent_dataframes(self):
        '''
        creates independent dataframes for all counties
        '''
        all_df = []
        for i in range(len(self.index_list)-1):
            temp_df = self.df[self.index_list[i]:self.index_list[i+1]]
            temp_df.reset_index(drop=True, inplace=True) 
            all_df.append(temp_df)
        return all_df
    
    
    def index_list(self):
        '''
        Gets index for start of data for each county
        '''
        index_list = []
        for index in range(len(self.df)):
            if self.df['Date'][index] == '01/01/2019':
                index_list.append(index)
        return index_list
        
    
    
    def create_df(self):
        d = { 'Date' :[], 'Daily Max 1-hour SO2 Concentration':[] ,'DAILY_AQI_VALUE':[] ,'DAILY_OBS_COUNT' :[],
         'PERCENT_COMPLETE':[]}
        
        for i in range(0,292):
            temp_df = self.df[self.df['Date'] == self.df['Date'][i]]
            temp_df = temp_df.mean(axis=0)    
            d['Date'].append(self.df['Date'][i])
            d['Daily Max 1-hour SO2 Concentration'].append(temp_df['Daily Max 1-hour SO2 Concentration'])
            d['DAILY_AQI_VALUE'].append(temp_df['DAILY_AQI_VALUE'])
            d['DAILY_OBS_COUNT'].append(temp_df['DAILY_OBS_COUNT'])
            d['PERCENT_COMPLETE'].append(temp_df['PERCENT_COMPLETE'])
        df = pd.DataFrame(data=d)
        return df
    
    
    def save_file(self):
        self.df.to_csv('data.csv',index=False)
    
    def mean_normalize_cont_var(self):
        self.df[CONT_VAR] = self.df[CONT_VAR].apply(lambda x: (x - x.min()) / (x.max() - x.min()))
        
    
    def create_categorical_variables(self):
        df = pd.get_dummies(self.df,columns=['Date'])
        return df

In [4]:
obj = DataPreProcessing()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [5]:
obj.printdf()

       Daily Max 1-hour SO2 Concentration  DAILY_AQI_VALUE  DAILY_OBS_COUNT  \
count                          292.000000       292.000000       292.000000   
mean                             0.031441         0.058367         0.750682   
std                              0.068036         0.083801         0.121386   
min                              0.000000         0.000000         0.000000   
25%                              0.013306         0.023148         0.680000   
50%                              0.021814         0.043554         0.760000   
75%                              0.031620         0.064485         0.844800   
max                              1.000000         1.000000         1.000000   

       PERCENT_COMPLETE  Date_Friday  Date_Monday  Date_Saturday  Date_Sunday  \
count        292.000000   292.000000   292.000000     292.000000   292.000000   
mean           0.756833     0.143836     0.140411       0.143836     0.140411   
std            0.120184     0.351525     0.34