In [1]:
import pandas as pd
import numpy as np
import pandera as pa
import plotly.graph_objects as go
import warnings
warnings.filterwarnings("ignore")
from pandera import DataFrameSchema, Column, Check
from ydata_profiling import ProfileReport

In [2]:
# Load the raw input table; the path is relative to the kernel's working
# directory. Later cells read this frame through the name `raw`.
raw = pd.read_csv(filepath_or_buffer="../data/raw/input.csv")

In [None]:
# Build a ProfileReport over the raw frame, titled "Profiling Report".
# The widget and HTML-export cells operate on this `profile` object.
profile = ProfileReport(raw, title="Profiling Report")

In [None]:
# Render the report through to_widgets() — presumably an interactive
# in-notebook widget view; confirm widget support is installed in the kernel.
profile.to_widgets()

In [None]:
# Write the report to ../output/profiling_report.html (path is relative to the
# kernel's working directory).
# NOTE(review): assumes the ../output directory already exists — confirm.
profile.to_file("../output/profiling_report.html")

In [None]:
# Expected layout of the input table. Every column is coerced to its declared
# dtype during validation; only CompNo, STInt, Company_name, Date and Y are
# required to be free of missing values.
input_data_schema = DataFrameSchema(
    {
        'CompNo': Column(str, nullable=False, coerce=True),
        'StkIndx': Column(float, nullable=True, coerce=True),
        'STInt': Column(float, nullable=False, coerce=True),
        # The remaining float columns all share one definition: nullable,
        # coerced floats. Each gets its own Column instance.
        **{
            feature: Column(float, nullable=True, coerce=True)
            for feature in (
                'dtdlevel', 'dtdtrend',
                'liqnonfinlevel', 'liqnonfintrend',
                'ni2talevel', 'ni2tatrend',
                'sizelevel', 'sizetrend',
                'm2b', 'sigma', 'DTDmedianNonFin',
            )
        },
        'Company_name': Column(str, nullable=False, coerce=True),
        'INDUSTRY2': Column(str, nullable=True, coerce=True),
        'Date': Column(pa.DateTime, nullable=False, coerce=True),
        # Y is a categorical column restricted to the values 0 and 1.
        'Y': Column(
            pa.Category,
            checks=pa.Check.isin([0, 1]),
            nullable=False,
            coerce=True,
        ),
    }
)

In [None]:
# Validate `raw` against the schema and keep the returned frame: the schema
# columns use coerce=True, and with the default inplace=False the dtype
# conversions are applied to the returned object only, not to `raw`.
# lazy=True collects every failing check into a single SchemaErrors report
# instead of stopping at the first failure.
validated = input_data_schema.validate(raw, lazy=True)

# Show a small preview rather than the whole frame.
validated.head()

In [4]:
# Pairwise correlation matrix over every column of `raw` except the
# identifier, company name, industry and date fields.
corr = raw.drop(columns=['CompNo', 'Company_name', 'INDUSTRY2', 'Date']).corr()

# Heatmap of the matrix on a red/blue colour scale pinned to [-1, 1].
heatmap = go.Heatmap(
    z=corr,
    x=corr.columns,
    y=corr.columns,
    colorscale='RdBu',
    zmin=-1,
    zmax=1,
)

fig = go.Figure(data=[heatmap])

# One text label per cell: the coefficient rounded to two decimals, placed at
# (column label, row label). The matrix is square, so corr.columns also names
# the rows.
annotations = [
    go.layout.Annotation(
        x=col_label,
        y=row_label,
        text=str(round(coefficient, 2)),
        showarrow=False,
    )
    for row_label, coefficients in zip(corr.columns, corr.values)
    for col_label, coefficient in zip(corr.columns, coefficients)
]
fig.update_layout(annotations=annotations)

fig.show()