In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import pandera as pa
import plotly.graph_objects as go
import warnings
warnings.filterwarnings("ignore")
from pandera import DataFrameSchema, Column, Check
from ydata_profiling import ProfileReport

In [None]:
# Load the raw input CSV into `raw`; the profiling, schema-validation and
# correlation cells below all read from this frame.
raw = pd.read_csv("../data/raw/input.csv")

In [None]:
# Build a ydata-profiling report object over the raw frame, titled "Profiling Report".
profile = ProfileReport(raw, title="Profiling Report")

In [None]:
# Show the profiling report inside the notebook via its widget rendering.
profile.to_widgets()

In [None]:
# Write the profiling report to an HTML file. The parent directory is created
# first (no-op if it already exists) so this cell also runs on a fresh
# checkout where ../output/ is missing.
report_path = Path("../output/profiling_report.html")
report_path.parent.mkdir(parents=True, exist_ok=True)
profile.to_file(report_path)

In [None]:
# Expected layout of the raw input data. Every column is declared with
# coerce=True, so validation first casts each column to its declared dtype.

# Float columns that may contain missing values. They are registered in the
# schema in exactly this order, right after 'STInt'.
_nullable_float_names = [
    'dtdlevel',
    'dtdtrend',
    'liqnonfinlevel',
    'liqnonfintrend',
    'ni2talevel',
    'ni2tatrend',
    'sizelevel',
    'sizetrend',
    'm2b',
    'sigma',
    'DTDmedianNonFin',
]

input_data_schema = DataFrameSchema(
    {
        'CompNo': Column(str, nullable=False, coerce=True),
        'StkIndx': Column(float, nullable=True, coerce=True),
        'STInt': Column(float, nullable=False, coerce=True),
        # One independent Column object per name (the comprehension builds a
        # fresh instance on every iteration).
        **{
            name: Column(float, nullable=True, coerce=True)
            for name in _nullable_float_names
        },
        'Company_name': Column(str, nullable=False, coerce=True),
        'INDUSTRY2': Column(str, nullable=True, coerce=True),
        'Date': Column(pa.DateTime, nullable=False, coerce=True),
        # 'Y' is coerced to a categorical and must only take the values 0 or 1.
        'Y': Column(
            pa.Category,
            checks=Check.isin([0, 1]),
            nullable=False,
            coerce=True,
        ),
    }
)

In [None]:
# Validate the raw frame against the schema. Because every column is declared
# with coerce=True, validate() returns a dtype-coerced frame (e.g. 'Date' as
# datetime, 'Y' as categorical) while `raw` itself is left untouched. Keep the
# coerced result under its own name instead of discarding it, and display only
# the first rows rather than echoing the entire frame as cell output.
input_validated = input_data_schema.validate(raw)
input_validated.head()

In [None]:
# Pairwise correlation matrix of `raw` after dropping the four listed columns.
excluded_columns = ['CompNo', 'Company_name', 'INDUSTRY2', 'Date']
corr = raw.drop(columns=excluded_columns).corr()
axis_labels = corr.columns

# Diverging colour scale pinned to the [-1, 1] range.
heatmap = go.Heatmap(
    z=corr,
    x=axis_labels,
    y=axis_labels,
    colorscale='RdBu',
    zmin=-1,
    zmax=1,
)

fig = go.Figure(data=[heatmap])

# Label every cell with its coefficient rounded to two decimals.
# np.ndenumerate walks the matrix row by row, yielding ((row, col), value).
annotations = [
    go.layout.Annotation(
        x=axis_labels[col_pos],
        y=axis_labels[row_pos],
        text=str(round(coefficient, 2)),
        showarrow=False,
    )
    for (row_pos, col_pos), coefficient in np.ndenumerate(corr.values)
]
fig.update_layout(annotations=annotations)

fig.show()

In [4]:
# Display the summary statistics table stored under ../data/processed/.
# NOTE(review): the rendered output shows 'Unnamed: 0.1' and 'Unnamed: 0'
# columns, which suggests the CSV was saved with its index included (possibly
# more than once). Consider re-exporting with index=False or reading with
# index_col - confirm against the code that writes this file.
pd.read_csv("../data/processed/summary_statistics.csv")

Unnamed: 0.1,Unnamed: 0,StkIndx,STInt,dtdlevel,dtdtrend,liqnonfinlevel,liqnonfintrend,ni2talevel,ni2tatrend,sizelevel,sizetrend,m2b,sigma,DTDmedianNonFin,Date
0,count,97381.0,97381.0,92114.0,92114.0,90168.0,90168.0,97381.0,97381.0,97381.0,97381.0,97381.0,97381.0,97381.0,97381
1,mean,0.064286,-0.005428,4.389687,-0.026637,0.463909,0.002275,0.013927,0.006432,0.482014,-0.024783,5.473755,0.172514,3.806439,2012-01-10 01:21:31.336092160
2,min,-0.24736,-0.011628,0.607629,-2.145232,-0.706834,-0.566069,-0.034921,-0.021029,-2.929091,-0.641217,0.498299,0.060186,2.054242,2000-01-31 00:00:00
3,25%,-0.011307,-0.011366,2.015259,-0.670383,-0.082028,-0.147445,-0.003918,-0.002438,-0.874668,-0.18589,0.767662,0.098361,2.969203,2006-09-30 00:00:00
4,50%,0.096765,-0.007939,3.398307,-0.021147,0.337872,-0.003774,0.002504,0.000273,0.524372,-0.009198,1.03664,0.144703,3.926915,2012-02-29 00:00:00
5,75%,0.153564,-0.002237,5.154352,0.550606,0.894563,0.112758,0.007485,0.00434,1.828284,0.155362,1.562645,0.216733,4.599985,2017-04-30 00:00:00
6,max,0.281292,0.009813,118.316968,15.157938,4.789012,2.712453,0.229865,0.106243,3.870103,0.512023,74.562839,0.431492,5.182964,2023-12-31 00:00:00
7,std,0.140667,0.0072,5.862846,1.19824,0.798335,0.325266,0.056521,0.026004,1.8825,0.286821,16.505342,0.098722,0.943457,
