# Cell 1: Import and Setup

In [None]:
import pandas as pd
import os # For file path operations
import matplotlib.pyplot as plt
import seaborn as sns

# Show every column when a DataFrame is displayed (no truncation)
pd.options.display.max_columns = None

# Data directories, given relative to the current working directory
RAW_PATH = "../data/raw/"
PROCESSED_PATH = "../data/processed/"

# Ensure the output directory exists; no error if it is already there
os.makedirs(PROCESSED_PATH, exist_ok=True)


# Cell 2: Loading and Merging all CSVs

In [None]:
# Collect every CSV file in the raw folder. os.listdir() returns entries in
# arbitrary order, so sort them to make the merged row order reproducible.
files = sorted(f for f in os.listdir(RAW_PATH) if f.endswith(".csv"))

if not files:
    # Fail with a clear message instead of pd.concat's generic
    # "No objects to concatenate" error further down.
    raise FileNotFoundError(f"No CSV files found in {RAW_PATH!r}")

dfs = []
for file in files:
    df = pd.read_csv(os.path.join(RAW_PATH, file))  # Read each CSV file
    # Tag each row with its source file name minus the extension. splitext
    # strips only the trailing ".csv"; str.replace would also remove any
    # ".csv" occurring earlier in the name.
    df["company"] = os.path.splitext(file)[0]
    dfs.append(df)

# Merge all DataFrames into one, renumbering the index from 0
data = pd.concat(dfs, ignore_index=True)

print("Merged Dataset Shape:", data.shape)
data.head()

# Cell 3: Keep data from 2016 onwards (the cutoff date is an arbitrary choice)

In [None]:
# Parse the "Date" column into pandas datetimes so it can be compared
data["Date"] = pd.to_datetime(data["Date"])

# Keep only rows dated 2016-01-01 or later, then renumber the index from 0
on_or_after_2016 = data["Date"] >= "2016-01-01"
data = data[on_or_after_2016].reset_index(drop=True)

print("Post-2016 Dataset Shape:", data.shape)
data.head()

# Cell 4: Pre-processing and EDA