## Config


In [1]:
import pandas as pd
import numpy as np

## Data

In [16]:
# Read the 'M3Quart' sheet of the M3C.xls workbook and preview its first rows.
df = pd.read_excel(io='M3C.xls', sheet_name='M3Quart')
df.head(n=5)

Unnamed: 0,Series,N,NF,Category,Starting Year,Starting Quarter,1,2,3,4,...,63,64,65,66,67,68,69,70,71,72
0,N 646,44,8,MICRO,1984,1,3142.63,3190.75,3178.69,3170.94,...,,,,,,,,,,
1,N 647,44,8,MICRO,1984,1,1522.0,1537.8,1602.6,1697.0,...,,,,,,,,,,
2,N 648,46,8,MICRO,1983,3,1549.72,1615.08,1620.33,1634.41,...,,,,,,,,,,
3,N 649,44,8,MICRO,1984,1,2182.6,2189.8,2136.7,2155.6,...,,,,,,,,,,
4,N 650,44,8,MICRO,1984,1,875.51,917.23,999.89,1014.7,...,,,,,,,,,,


In [17]:
# Turn series labels such as 'N 646' into the integer id 646.
# astype(str) makes the cell safe to re-run: once the column is already integer the
# `.str` accessor would otherwise raise. regex=False pins a literal match so the result
# does not depend on the pandas-version-specific default of `regex`.
df['Series'] = (
    df['Series']
    .astype(str)
    .str.replace('N', '', regex=False)  # drop the 'N' prefix, with or without a following space
    .str.strip()                        # remove the whitespace left behind by 'N 646'
    .astype(int)
)

## Melt from wide format to long format

In [18]:
# Using pandas.melt to transform the DataFrame
melted_df = pd.melt(df, id_vars=['Series', 'N', 'NF', 'Category', 'Starting Year', 'Starting Quarter'], 
                    var_name='Measurement', value_name='Value')

In [19]:
# Order rows by series and then by position within the series.
# 'Measurement' is listed explicitly so that the chronological order inside each series is
# part of the sort key instead of depending on the multi-column sort preserving the row
# order produced by melt(). 'Category' stays in the key so the original ordering still holds.
melted_df = melted_df.sort_values(by=['Series', 'Category', 'Measurement'])

NB! One series has a starting quarter of 9, which is not a valid quarter (quarters run from 1 to 4). Treat this as a data-entry error and replace it with 1.

In [20]:
# A starting quarter of 9 is mapped to quarter 1; every other value is left unchanged.
melted_df['Starting Quarter'] = melted_df['Starting Quarter'].replace({9: 1})

In [21]:
# Trim trailing whitespace from the category labels, then list the distinct labels that remain.
melted_df = melted_df.assign(Category=melted_df['Category'].str.rstrip())
print(pd.unique(melted_df['Category']))

['MICRO' 'INDUSTRY' 'MACRO' 'FINANCE' 'DEMOGRAPHIC']


In [22]:
# Discard rows whose 'Value' is missing, then renumber the index from 0
# (the old index is thrown away rather than kept as a column).
melted_df = (
    melted_df
    .dropna(subset=['Value'])
    .reset_index(drop=True)
)

In [23]:
# Check dtypes and non-null counts now that rows with a missing 'Value' have been dropped.
melted_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37004 entries, 0 to 37003
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Series            37004 non-null  int64  
 1   N                 37004 non-null  int64  
 2   NF                37004 non-null  int64  
 3   Category          37004 non-null  object 
 4   Starting Year     37004 non-null  int64  
 5   Starting Quarter  37004 non-null  int64  
 6   Measurement       37004 non-null  object 
 7   Value             37004 non-null  float64
dtypes: float64(1), int64(5), object(2)
memory usage: 2.3+ MB


In [24]:
# Display the long-format frame (pandas truncates the repr to the first and last rows).
melted_df

Unnamed: 0,Series,N,NF,Category,Starting Year,Starting Quarter,Measurement,Value
0,646,44,8,MICRO,1984,1,1,3142.63
1,646,44,8,MICRO,1984,1,2,3190.75
2,646,44,8,MICRO,1984,1,3,3178.69
3,646,44,8,MICRO,1984,1,4,3170.94
4,646,44,8,MICRO,1984,1,5,3124.38
...,...,...,...,...,...,...,...,...
36999,1401,48,8,DEMOGRAPHIC,1964,1,44,2490.00
37000,1401,48,8,DEMOGRAPHIC,1964,1,45,2070.00
37001,1401,48,8,DEMOGRAPHIC,1964,1,46,2000.00
37002,1401,48,8,DEMOGRAPHIC,1964,1,47,1650.00


## Add a datetime column

In [25]:
# Derive a quarter-start timestamp for each observation.
# 'Measurement' is the 1-based position of the observation within its series, so it acts as
# a quarter offset from the series' starting year/quarter.

# melt() leaves 'Measurement' as an object column (see .info()); cast it once so the
# arithmetic below is vectorized integer math instead of per-element Python operations.
measurement_idx = melted_df['Measurement'].astype('int64')

# Absolute 1-based quarter count measured from Q1 of year 1.
total_quarters = (melted_df['Starting Year'] - 1) * 4 + melted_df['Starting Quarter'] + measurement_idx - 1

# Split the absolute count back into a calendar year and a quarter in 1..4.
actual_year = (total_quarters - 1) // 4 + 1
actual_quarter = (total_quarters - 1) % 4 + 1

# First month of each quarter: Q1 -> 1, Q2 -> 4, Q3 -> 7, Q4 -> 10.
actual_month = (actual_quarter - 1) * 3 + 1

# Assemble the dates from numeric components rather than formatting and re-parsing strings;
# the day is fixed to the first of the month.
melted_df['Date'] = pd.to_datetime(
    pd.DataFrame({'year': actual_year, 'month': actual_month, 'day': 1})
)

melted_df.head()


Unnamed: 0,Series,N,NF,Category,Starting Year,Starting Quarter,Measurement,Value,Date
0,646,44,8,MICRO,1984,1,1,3142.63,1984-01-01
1,646,44,8,MICRO,1984,1,2,3190.75,1984-04-01
2,646,44,8,MICRO,1984,1,3,3178.69,1984-07-01
3,646,44,8,MICRO,1984,1,4,3170.94,1984-10-01
4,646,44,8,MICRO,1984,1,5,3124.38,1985-01-01


In [26]:
# The starting year/quarter are now encoded in 'Date', so remove both columns.
melted_df = melted_df.drop(columns=['Starting Year', 'Starting Quarter'])

In [27]:
# Re-inspect the schema after adding 'Date' and dropping the starting year/quarter columns.
melted_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37004 entries, 0 to 37003
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Series       37004 non-null  int64         
 1   N            37004 non-null  int64         
 2   NF           37004 non-null  int64         
 3   Category     37004 non-null  object        
 4   Measurement  37004 non-null  object        
 5   Value        37004 non-null  float64       
 6   Date         37004 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(3), object(2)
memory usage: 2.0+ MB


In [30]:
# Spot-check a single series: show every row belonging to series 764.
melted_df[melted_df['Series'].eq(764)]

Unnamed: 0,Series,N,NF,Category,Measurement,Value,Date
5202,764,44,8,MICRO,1,2711.1,1984-01-01
5203,764,44,8,MICRO,2,2933.95,1984-04-01
5204,764,44,8,MICRO,3,3046.8,1984-07-01
5205,764,44,8,MICRO,4,3294.8,1984-10-01
5206,764,44,8,MICRO,5,3141.15,1985-01-01
5207,764,44,8,MICRO,6,3306.25,1985-04-01
5208,764,44,8,MICRO,7,3373.35,1985-07-01
5209,764,44,8,MICRO,8,4037.6,1985-10-01
5210,764,44,8,MICRO,9,3758.95,1986-01-01
5211,764,44,8,MICRO,10,3833.0,1986-04-01


In [29]:
# Persist the processed long-format table as snappy-compressed Parquet via pyarrow,
# without writing the index.
melted_df.to_parquet(
    path='M3_quarter_processed.parquet',
    engine='pyarrow',
    compression='snappy',
    index=False,
)