# CPI data cleaning

In [2]:
import pandas as pd

In [5]:
cpi = pd.read_csv('../CPI_Data/Monthly_CPI_data_w_percent_change_apr.csv', index_col = 0)
cpi

Unnamed: 0,Food,Food at home,Cereals and bakery products,Cereals and cereal products,Flour and prepared flour mixes,Breakfast cereal,"Rice, pasta, cornmeal",Bakery products,Bread,"Fresh biscuits, rolls, muffins",...,"Other pork including roasts, steaks, and ribs",Other uncooked poultry including turkey,Photographers and photo processing,Sugar and sugar substitutes,"Men's underwear, nightwear, swimwear, and accessories","Women's underwear, nightwear, swimwear, and accessories","Computers, peripherals, and smart home assistant devices","Computers, peripherals, and smart home assistants",Day care and preschool,Residential telephone services
2012-01,232.666,231.694,266.677,234.159,252.159,228.664,242.992,283.880,170.658,167.911,...,,,,,,,,,,
2012-02,232.486,231.180,267.821,233.362,250.564,227.984,242.217,286.484,172.978,168.364,...,,,,,,,,,,
2012-03,232.792,231.383,267.101,232.660,252.104,227.997,238.975,285.771,172.955,168.406,...,,,,,,,,,,
2012-04,233.234,231.711,268.014,233.662,252.102,228.862,240.693,286.589,173.559,166.211,...,,,,,,,,,,
2012-05,233.339,231.518,268.653,235.054,254.336,230.326,241.615,286.629,173.581,167.158,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Monthly Percent Change 2021,0.500,0.400,0.500,1.200,0.700,1.400,1.100,0.100,0.000,-1.600,...,-4.500,-0.900,-1.500,0.000,-2.200,-0.900,,-1.000,0.100,
2022-01,289.772,270.711,302.242,246.536,264.478,238.165,255.463,334.550,202.802,197.864,...,159.208,173.654,125.879,217.259,170.644,103.677,,39.651,323.184,140.232
2022-02,292.794,274.568,306.193,249.590,266.133,240.812,259.560,339.033,203.586,202.667,...,160.293,175.812,126.290,219.558,174.732,108.355,,39.958,324.504,140.970
2022-03,295.728,278.612,311.606,255.990,273.106,246.618,266.558,343.758,205.262,207.902,...,161.906,178.389,127.398,223.019,175.267,107.575,,40.302,324.205,140.640


In [6]:
# checking sums of na values
cpi.isna().sum()

Food                                                          0
Food at home                                                  0
Cereals and bakery products                                   0
Cereals and cereal products                                   0
Flour and prepared flour mixes                                0
                                                           ... 
Women's underwear, nightwear, swimwear, and accessories      98
Computers, peripherals, and smart home assistant devices    130
Computers, peripherals, and smart home assistants           112
Day care and preschool                                      112
Residential telephone services                              140
Length: 328, dtype: int64

In [7]:
#columns that have null values in it
cpi.columns[cpi.isna().sum() > 0]

Index(['Other pork including roasts and picnics',
       'Other poultry including turkey', 'Sugar and artificial sweeteners',
       'Food at employee sites and schools', 'Men's furnishings',
       'Women's underwear, nightwear, sportswear and accessories',
       'Audio discs, tapes and other media',
       'Personal computers and peripheral equipment', 'Household operations',
       'Domestic services', 'Gardening and lawncare services',
       'Repair of household items', 'Care of invalids and elderly at home',
       'Leased cars and trucks', 'Airline fare',
       'Cable and satellite television and radio service',
       'Video discs and other media, including rental of video and audio',
       'Photographers and film processing',
       'Club dues and fees for participant sports and group exercises',
       'Fees for lessons or instructions', 'Child care and nursery school',
       'Land-line telephone services',
       'Recorded music and music subscriptions', 'Airline fares',

These expenditure categories have missing values. We need to determine if we can fix these null vaues by getting the correct data and inputting it in or to not consider these for our analysis.

Let's look into a column that has missing values and explore its data.

When looking at some columns, such as 'Airline Fare'. There are actually some gramatical differences in the category names. All of these cases have to be handled for our data to be accurate.

In [8]:
cpi['Airline fare'].isna()

2012-01                        False
2012-02                        False
2012-03                        False
2012-04                        False
2012-05                        False
                               ...  
Monthly Percent Change 2021     True
2022-01                         True
2022-02                         True
2022-03                         True
2022-04                         True
Name: Airline fare, Length: 144, dtype: bool

In [9]:
#there are actually different column names for same variable

for i in cpi.columns:
    if "Airline" in i:
        print(i)

Airline fare
Airline fares


In [10]:
#similar categories put into a dictionary
category_differences = {'Computers, peripherals, and smart home assistant devices':'Computers, peripherals, and smart home assistants',
                        'Airline fare':'Airline fares',"Women's underwear, nightwear, sportswear and accessories":"Women's underwear, nightwear, swimwear, and accessories",
                        "Sugar and artificial sweeteners":"Sugar and sugar substitutes",
                        "Photographers and film processing":"Photographers and photo processing",
                        'Other poultry including turkey':'Other uncooked poultry including turkey',
                    'Other pork including roasts and picnics':'Other pork including roasts, steaks, and ribs',
                        'Club dues and fees for participant sports and group exercises':
                        'Club membership for shopping clubs, fraternal, or other organizations, or participant sports fees',
                        'Video discs and other media, including rental of video and audio':
                        'Video discs and other media, including rental of video',
                       'Cable and satellite television and radio service':'Cable and satellite television service',
                        'Child care and nursery school' : 'Day care and preschool','Recorded music and music subscriptions':'Audio discs, tapes and other media',
                       "Men's underwear, nightwear, swimwear, and accessories":"Men's furnishings",
                       'Personal computers and peripheral equipment':'Computers, peripherals, and smart home assistant devices'}

#catgegories that did not have data for specific year
#'household operations', 'Leased cars and trucks','Domestic services'

In [11]:
def cpi_null(diction, cpi):
    cpi_copy = cpi.copy()
    #takes in lst of columns with null values
    for key,val in diction.items():
        if key in cpi_copy.columns:
            # create new dataframe for each column to merge
            key_column = pd.DataFrame(cpi_copy[key])
            val_column = pd.DataFrame(cpi_copy[val])
            
            #change the column name so that we can merge them
            val_column = val_column.rename(columns = {val:key})
            
            #outer merge of both dataframes
#             result_series = pd.merge(key_column, val_column, how = 'left')
            result_series = key_column.combine_first(val_column)

            cpi[key] = result_series[key]
            cpi = cpi.drop(columns =  [val], axis = 1)

    return cpi
#             cpi = pd.concat([cpi[key], result_series[key]], axis = 1)


            
#         if key in cpi.columns:

#             result = pd.concat([cpi_copy[key], cpi_copy[val]], axis=1)
#             cpi_copy[key] = pd.concat([cpi_copy[key], cpi_copy[val]], axis=1)
#             result = cpi_copy[key].merge(cpi_copy[val])
#             print(result)

In [12]:
cpi = cpi_null(category_differences,cpi)

In [13]:
cpi

Unnamed: 0,Food,Food at home,Cereals and bakery products,Cereals and cereal products,Flour and prepared flour mixes,Breakfast cereal,"Rice, pasta, cornmeal",Bakery products,Bread,"Fresh biscuits, rolls, muffins",...,Transportation,Private transportation,New and used motor vehicles,Utilities and public transportation,Household furnishings and operations,Other goods and services,Personal care,Recorded music and music subscriptions,"Men's underwear, nightwear, swimwear, and accessories",Residential telephone services
2012-01,232.666,231.694,266.677,234.159,252.159,228.664,242.992,283.880,170.658,167.911,...,210.799,206.307,99.659,205.521,125.629,391.382,210.299,89.691,148.547,
2012-02,232.486,231.180,267.821,233.362,250.564,227.984,242.217,286.484,172.978,168.364,...,214.429,210.013,99.889,205.398,126.180,391.236,210.330,89.896,148.183,
2012-03,232.792,231.383,267.101,232.660,252.104,227.997,238.975,285.771,172.955,168.406,...,220.842,216.536,100.325,205.637,126.107,392.364,211.289,89.704,149.106,
2012-04,233.234,231.711,268.014,233.662,252.102,228.862,240.693,286.589,173.559,166.211,...,223.083,218.563,100.977,206.050,126.114,393.320,211.865,89.084,150.584,
2012-05,233.339,231.518,268.653,235.054,254.336,230.326,241.615,286.629,173.581,167.158,...,220.768,215.978,101.399,206.866,125.905,392.859,211.649,88.904,155.044,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Monthly Percent Change 2021,0.500,0.400,0.500,1.200,0.700,1.400,1.100,0.100,0.000,-1.600,...,0.400,0.500,2.000,-0.200,1.100,0.500,0.500,0.400,-2.200,
2022-01,289.772,270.711,302.242,246.536,264.478,238.165,255.463,334.550,202.802,197.864,...,248.424,248.995,126.211,235.419,139.162,490.856,249.954,79.855,170.644,140.232
2022-02,292.794,274.568,306.193,249.590,266.133,240.812,259.560,339.033,203.586,202.667,...,253.150,253.525,126.822,236.442,140.306,496.045,252.862,78.968,174.732,140.970
2022-03,295.728,278.612,311.606,255.990,273.106,246.618,266.558,343.758,205.262,207.902,...,264.525,264.669,126.417,238.961,141.358,498.538,254.111,78.819,175.267,140.640


In [14]:
cpi.columns[cpi.isna().sum() > 0]

Index(['Food at employee sites and schools',
       'Personal computers and peripheral equipment', 'Household operations',
       'Domestic services', 'Gardening and lawncare services',
       'Repair of household items', 'Care of invalids and elderly at home',
       'Leased cars and trucks', 'Fees for lessons or instructions',
       'Land-line telephone services', 'Residential telephone services'],
      dtype='object')

In [15]:
cpi.isna().sum()

Food                                                       0
Food at home                                               0
Cereals and bakery products                                0
Cereals and cereal products                                0
Flour and prepared flour mixes                             0
                                                        ... 
Other goods and services                                   0
Personal care                                              0
Recorded music and music subscriptions                     0
Men's underwear, nightwear, swimwear, and accessories      0
Residential telephone services                           140
Length: 314, dtype: int64

In [16]:
#cleaned cpi data
cpi

Unnamed: 0,Food,Food at home,Cereals and bakery products,Cereals and cereal products,Flour and prepared flour mixes,Breakfast cereal,"Rice, pasta, cornmeal",Bakery products,Bread,"Fresh biscuits, rolls, muffins",...,Transportation,Private transportation,New and used motor vehicles,Utilities and public transportation,Household furnishings and operations,Other goods and services,Personal care,Recorded music and music subscriptions,"Men's underwear, nightwear, swimwear, and accessories",Residential telephone services
2012-01,232.666,231.694,266.677,234.159,252.159,228.664,242.992,283.880,170.658,167.911,...,210.799,206.307,99.659,205.521,125.629,391.382,210.299,89.691,148.547,
2012-02,232.486,231.180,267.821,233.362,250.564,227.984,242.217,286.484,172.978,168.364,...,214.429,210.013,99.889,205.398,126.180,391.236,210.330,89.896,148.183,
2012-03,232.792,231.383,267.101,232.660,252.104,227.997,238.975,285.771,172.955,168.406,...,220.842,216.536,100.325,205.637,126.107,392.364,211.289,89.704,149.106,
2012-04,233.234,231.711,268.014,233.662,252.102,228.862,240.693,286.589,173.559,166.211,...,223.083,218.563,100.977,206.050,126.114,393.320,211.865,89.084,150.584,
2012-05,233.339,231.518,268.653,235.054,254.336,230.326,241.615,286.629,173.581,167.158,...,220.768,215.978,101.399,206.866,125.905,392.859,211.649,88.904,155.044,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Monthly Percent Change 2021,0.500,0.400,0.500,1.200,0.700,1.400,1.100,0.100,0.000,-1.600,...,0.400,0.500,2.000,-0.200,1.100,0.500,0.500,0.400,-2.200,
2022-01,289.772,270.711,302.242,246.536,264.478,238.165,255.463,334.550,202.802,197.864,...,248.424,248.995,126.211,235.419,139.162,490.856,249.954,79.855,170.644,140.232
2022-02,292.794,274.568,306.193,249.590,266.133,240.812,259.560,339.033,203.586,202.667,...,253.150,253.525,126.822,236.442,140.306,496.045,252.862,78.968,174.732,140.970
2022-03,295.728,278.612,311.606,255.990,273.106,246.618,266.558,343.758,205.262,207.902,...,264.525,264.669,126.417,238.961,141.358,498.538,254.111,78.819,175.267,140.640


In [18]:
# save to csv
# cpi.to_csv('../CPI_Data/Cleaned_CPI_data_apr.csv')