## Config


In [1]:
import pandas as pd
import numpy as np

## Data

In [2]:
# Read the 'M3Month' sheet of the workbook into a wide DataFrame: one row per
# series, with the observations spread across numbered columns.
df = pd.read_excel('M3C.xls', sheet_name='M3Month')
# Preview the first rows to check that the sheet was parsed as expected.
df.head()

Unnamed: 0,Series,N,NF,Category,Starting Year,Starting Month,1,2,3,4,...,135,136,137,138,139,140,141,142,143,144
0,N1402,68,18,MICRO,1990,1,2640.0,2640.0,2160.0,4200.0,...,,,,,,,,,,
1,N1403,68,18,MICRO,1990,1,1680.0,1920.0,120.0,1080.0,...,,,,,,,,,,
2,N1404,68,18,MICRO,1990,1,1140.0,720.0,4860.0,1200.0,...,,,,,,,,,,
3,N1405,68,18,MICRO,1990,1,180.0,940.0,2040.0,800.0,...,,,,,,,,,,
4,N1406,68,18,MICRO,1990,1,2000.0,1550.0,4450.0,3050.0,...,,,,,,,,,,


In [3]:
# Turn series identifiers such as 'N1402' (optionally with whitespace around the
# 'N') into plain integers.
# - Casting to str first keeps the cell idempotent: re-running it after 'Series'
#   has already been converted to int no longer fails on the .str accessor.
# - regex=True is passed explicitly because the default of str.replace differs
#   between pandas versions; the pattern is anchored so only the prefix is removed.
df['Series'] = (
    df['Series']
    .astype(str)
    .str.replace(r'^\s*N\s*', '', regex=True)
    .astype(int)
)

## Melt from wide format to long format

In [5]:
# Reshape wide -> long: the identifier columns are repeated on every row, and each
# remaining (numbered observation) column becomes one ('Measurement', 'Value') row.
id_columns = ['Series', 'N', 'NF', 'Category', 'Starting Year', 'Starting Month']
melted_df = df.melt(id_vars=id_columns, var_name='Measurement', value_name='Value')

In [6]:
# Order the long-format rows by series id, then category, so that the rows of
# each series sit next to each other.
sort_keys = ['Series', 'Category']
melted_df = melted_df.sort_values(by=sort_keys)

In [10]:
# Inspect the starting months: list the distinct values (a 0 shows up among them)
# and print the median.
starting_month = melted_df['Starting Month']
print(starting_month.unique())
print(starting_month.median())

[ 1 10 12  3  6 11  4  7  2  5  9  8  0]
1.0


In [9]:
# Print the median starting year across all rows of the long-format frame.
starting_year_median = melted_df['Starting Year'].median()
print(starting_year_median)

1983.0


Some series have no recorded starting year or month (stored as 0). Replace those zeros with the median of the corresponding column.

In [11]:
# Impute unknown start dates. A 0 in 'Starting Year' / 'Starting Month' is the
# placeholder for "not recorded", so:
# - the median is taken over the known (non-zero) entries only, otherwise the
#   placeholders themselves pull the imputed value down;
# - the median is cast to int (truncating any .5) so the columns keep an integer
#   dtype for the date arithmetic that follows.
for start_col in ['Starting Year', 'Starting Month']:
    known_values = melted_df.loc[melted_df[start_col] != 0, start_col]
    fill_value = int(known_values.median())
    melted_df[start_col] = melted_df[start_col].replace(0, fill_value)

In [12]:
# Spot-check a single series (id 2801) after the start-date imputation.
is_series_2801 = melted_df['Series'].eq(2801)
melted_df.loc[is_series_2801]

Unnamed: 0,Series,N,NF,Category,Starting Year,Starting Month,Measurement,Value
1399,2801,71,18,OTHER,1983,1,1,8139.0
2827,2801,71,18,OTHER,1983,1,2,8332.2
4255,2801,71,18,OTHER,1983,1,3,8321.6
5683,2801,71,18,OTHER,1983,1,4,8374.7
7111,2801,71,18,OTHER,1983,1,5,8389.6
...,...,...,...,...,...,...,...,...
199891,2801,71,18,OTHER,1983,1,140,
201319,2801,71,18,OTHER,1983,1,141,
202747,2801,71,18,OTHER,1983,1,142,
204175,2801,71,18,OTHER,1983,1,143,


In [13]:
# Strip trailing whitespace from the category labels, then list the distinct
# labels to confirm that no padded duplicates remain.
category_clean = melted_df['Category'].str.rstrip()
melted_df['Category'] = category_clean
print(category_clean.unique())

['MICRO' 'INDUSTRY' 'MACRO' 'FINANCE' 'DEMOGRAPHIC' 'OTHER']


In [14]:
# Discard rows that carry no observation (NaN in 'Value') and renumber the index
# from 0 so it is contiguous again.
melted_df = melted_df.dropna(subset=['Value']).reset_index(drop=True)

In [15]:
# Summarize column dtypes and non-null counts after removing the empty rows.
melted_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167562 entries, 0 to 167561
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Series          167562 non-null  int64  
 1   N               167562 non-null  int64  
 2   NF              167562 non-null  int64  
 3   Category        167562 non-null  object 
 4   Starting Year   167562 non-null  int64  
 5   Starting Month  167562 non-null  int64  
 6   Measurement     167562 non-null  object 
 7   Value           167562 non-null  float64
dtypes: float64(1), int64(5), object(2)
memory usage: 10.2+ MB


In [16]:
# Display the long-format frame; the notebook renders only the first and last rows.
melted_df

Unnamed: 0,Series,N,NF,Category,Starting Year,Starting Month,Measurement,Value
0,1402,68,18,MICRO,1990,1,1,2640.0
1,1402,68,18,MICRO,1990,1,2,2640.0
2,1402,68,18,MICRO,1990,1,3,2160.0
3,1402,68,18,MICRO,1990,1,4,4200.0
4,1402,68,18,MICRO,1990,1,5,3360.0
...,...,...,...,...,...,...,...,...
167557,2829,71,18,OTHER,1983,1,67,1282.5
167558,2829,71,18,OTHER,1983,1,68,1261.3
167559,2829,71,18,OTHER,1983,1,69,1263.4
167560,2829,71,18,OTHER,1983,1,70,1257.1


## Add a datetime column

In [17]:
# Ensure 'Measurement' (the 1-based observation number within a series) is an integer
melted_df['Measurement'] = melted_df['Measurement'].astype(int)

# Calculate the total months from the starting point: a 1-based month count in
# which the series' start month is shifted forward by (Measurement - 1) months.
total_months = (melted_df['Starting Year'] - 1) * 12 + melted_df['Starting Month'] + melted_df['Measurement'] - 1

# Calculate the actual year and month (month is mapped back into 1..12)
actual_year = (total_months - 1) // 12 + 1
actual_month = (total_months - 1) % 12 + 1

# Create the datetime column from numeric year/month/day components (day fixed
# to the 1st). Assembling from numbers avoids building one string per row and
# relying on format inference for non-zero-padded months such as '1990-1-01'.
melted_df['Date'] = pd.to_datetime(
    pd.DataFrame({'year': actual_year, 'month': actual_month, 'day': 1})
)

# Display the head of the DataFrame to verify the changes
melted_df.head()


Unnamed: 0,Series,N,NF,Category,Starting Year,Starting Month,Measurement,Value,Date
0,1402,68,18,MICRO,1990,1,1,2640.0,1990-01-01
1,1402,68,18,MICRO,1990,1,2,2640.0,1990-02-01
2,1402,68,18,MICRO,1990,1,3,2160.0,1990-03-01
3,1402,68,18,MICRO,1990,1,4,4200.0,1990-04-01
4,1402,68,18,MICRO,1990,1,5,3360.0,1990-05-01


In [18]:
# Remove the two start-date columns from the frame.
start_columns = ['Starting Year', 'Starting Month']
melted_df = melted_df.drop(columns=start_columns)

In [19]:
# Check the final schema (column dtypes and non-null counts) before exporting.
melted_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167562 entries, 0 to 167561
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Series       167562 non-null  int64         
 1   N            167562 non-null  int64         
 2   NF           167562 non-null  int64         
 3   Category     167562 non-null  object        
 4   Measurement  167562 non-null  int64         
 5   Value        167562 non-null  float64       
 6   Date         167562 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(4), object(1)
memory usage: 8.9+ MB


In [20]:
# Write the processed long-format frame to a snappy-compressed Parquet file via
# pyarrow, without storing the index.
melted_df.to_parquet(
    'M3_month_processed.parquet',
    engine='pyarrow',
    compression='snappy',
    index=False,
)