In [8]:
import sys
import os
# Extend the module search path with ../src (resolved against the current
# working directory) before the project-local import below runs.
# NOTE(review): presumably data_preprocessing.py lives in ../src — confirm.
sys.path.append(os.path.abspath("../src"))

import pandas as pd
# Project-local loader functions; they are called in the dataset-loading cell.
from data_preprocessing import load_and_clean_trader_data, load_and_clean_sentiment_data


In [9]:
# --- Load both datasets ---
#
# The raw CSV locations default to the paths this notebook was originally run
# with, but each can be redirected without editing the notebook by setting the
# TRADER_DATA_PATH / SENTIMENT_DATA_PATH environment variables (for example
# with `%env` in an earlier cell). This keeps the notebook runnable on machines
# that do not share the original author's directory layout.

trader_path = os.environ.get(
    "TRADER_DATA_PATH", r"C:\Users\nisha\Downloads\historical_data.csv"
)
sentiment_path = os.environ.get(
    "SENTIMENT_DATA_PATH", r"C:\Users\nisha\Downloads\fear_greed_index.csv"
)

# The cleaning rules themselves live in data_preprocessing, not in this cell.
trader_df = load_and_clean_trader_data(trader_path)
sentiment_df = load_and_clean_sentiment_data(sentiment_path)

# Preview the first rows of each frame as a quick sanity check.
print("=== Trader Data (first 5 rows) ===")
display(trader_df.head())

print("\n=== Sentiment Data (first 5 rows) ===")
display(sentiment_df.head())


=== Trader Data (first 5 rows) ===


Unnamed: 0,Account,Coin,Execution Price,Size Tokens,Size USD,Side,Timestamp IST,Start Position,Direction,Closed PnL,Transaction Hash,Order ID,Crossed,Fee,Trade ID,Timestamp,Date
0,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,@107,7.9769,986.87,7872.16,BUY,2024-02-12 22:50:00,0.0,Buy,0.0,0xec09451986a1874e3a980418412fcd0201f500c95bac...,52017706630,True,0.345404,895000000000000.0,1730000000000.0,2024-02-12
1,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,@107,7.98,16.0,127.68,BUY,2024-02-12 22:50:00,986.524596,Buy,0.0,0xec09451986a1874e3a980418412fcd0201f500c95bac...,52017706630,True,0.0056,443000000000000.0,1730000000000.0,2024-02-12
2,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,@107,7.9855,144.09,1150.63,BUY,2024-02-12 22:50:00,1002.518996,Buy,0.0,0xec09451986a1874e3a980418412fcd0201f500c95bac...,52017706630,True,0.050431,660000000000000.0,1730000000000.0,2024-02-12
3,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,@107,7.9874,142.98,1142.04,BUY,2024-02-12 22:50:00,1146.558564,Buy,0.0,0xec09451986a1874e3a980418412fcd0201f500c95bac...,52017706630,True,0.050043,1080000000000000.0,1730000000000.0,2024-02-12
4,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,@107,7.9894,8.73,69.75,BUY,2024-02-12 22:50:00,1289.488521,Buy,0.0,0xec09451986a1874e3a980418412fcd0201f500c95bac...,52017706630,True,0.003055,1050000000000000.0,1730000000000.0,2024-02-12



=== Sentiment Data (first 5 rows) ===


Unnamed: 0,timestamp,value,classification,Date
0,1517463000,30,Fear,2018-02-01
1,1517549400,15,Extreme Fear,2018-02-02
2,1517635800,40,Fear,2018-02-03
3,1517722200,24,Extreme Fear,2018-02-04
4,1517808600,11,Extreme Fear,2018-02-05


In [10]:
# --- Shapes and Data Info ---

# Row/column counts for both frames.
print("Trader Data Shape:", trader_df.shape)
print("Sentiment Data Shape:", sentiment_df.shape)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line
# after each report.
print("\n=== Trader Data Info ===")
trader_df.info()

print("\n=== Sentiment Data Info ===")
sentiment_df.info()


Trader Data Shape: (211224, 17)
Sentiment Data Shape: (2644, 4)

=== Trader Data Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211224 entries, 0 to 211223
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   Account           211224 non-null  object        
 1   Coin              211224 non-null  object        
 2   Execution Price   211224 non-null  float64       
 3   Size Tokens       211224 non-null  float64       
 4   Size USD          211224 non-null  float64       
 5   Side              211224 non-null  object        
 6   Timestamp IST     79225 non-null   datetime64[ns]
 7   Start Position    211224 non-null  float64       
 8   Direction         211224 non-null  object        
 9   Closed PnL        211224 non-null  float64       
 10  Transaction Hash  211224 non-null  object        
 11  Order ID          211224 non-null  int64         
 12  Crossed           211224

In [11]:
# --- Missing values check ---
# For each dataset, print a heading followed by the per-column count of null
# entries. The heading and the counts are joined with a newline separator so
# the text written to stdout matches two consecutive print() calls.
# NOTE(review): the recorded output shows 131,999 nulls in both
# "Timestamp IST" and "Date" (out of 211,224 rows) — presumably a date-parsing
# issue in the loader; confirm before relying on "Date" downstream.

print("\nMissing Values in Trader Data:\n", trader_df.isna().sum(), sep="\n")

print("\nMissing Values in Sentiment Data:\n", sentiment_df.isna().sum(), sep="\n")



Missing Values in Trader Data:

Account                  0
Coin                     0
Execution Price          0
Size Tokens              0
Size USD                 0
Side                     0
Timestamp IST       131999
Start Position           0
Direction                0
Closed PnL               0
Transaction Hash         0
Order ID                 0
Crossed                  0
Fee                      0
Trade ID                 0
Timestamp                0
Date                131999
dtype: int64

Missing Values in Sentiment Data:

timestamp         0
value             0
classification    0
Date              0
dtype: int64


In [12]:
# --- Numeric summaries ---
# Show descriptive statistics for the trader frame, then for the sentiment
# frame's "value" column, each under its own heading.

print("\n=== Trader Data Description ===")
trader_summary = trader_df.describe()
display(trader_summary)

print("\n=== Sentiment Value Stats ===")
sentiment_value_summary = sentiment_df["value"].describe()
display(sentiment_value_summary)



=== Trader Data Description ===


Unnamed: 0,Execution Price,Size Tokens,Size USD,Timestamp IST,Start Position,Closed PnL,Order ID,Fee,Trade ID,Timestamp
count,211224.0,211224.0,211224.0,79225,211224.0,211224.0,211224.0,211224.0,211224.0,211224.0
mean,11414.72335,4623.365,5639.451,2025-05-01 09:10:16.361502208,-29946.25,48.749001,69653880000.0,1.163967,562854900000000.0,1737744000000.0
min,5e-06,8.74e-07,0.0,2023-01-05 01:06:00,-14334630.0,-117990.1041,173271100.0,-1.175712,0.0,1680000000000.0
25%,4.8547,2.94,193.79,2025-02-02 13:02:00,-376.2311,0.0,59838530000.0,0.016121,281000000000000.0,1740000000000.0
50%,18.28,32.0,597.045,2025-06-02 19:46:00,84.72793,0.0,74429390000.0,0.089578,562000000000000.0,1740000000000.0
75%,101.58,187.9025,2058.96,2025-10-01 03:49:00,9337.278,5.792797,83355430000.0,0.393811,846000000000000.0,1740000000000.0
max,109004.0,15822440.0,3921431.0,2025-12-04 23:55:00,30509480.0,135329.0901,90149230000.0,837.471593,1130000000000000.0,1750000000000.0
std,29447.654868,104272.9,36575.14,,673807.4,919.164828,18357530000.0,6.758854,325756500000000.0,8689920000.0



=== Sentiment Value Stats ===


count    2644.000000
mean       46.981089
std        21.827680
min         5.000000
25%        28.000000
50%        46.000000
75%        66.000000
max        95.000000
Name: value, dtype: float64

In [13]:
# --- Save cleaned data for later use ---
# Write both cleaned frames as index-free CSV files into a single output
# folder, creating that folder first when it does not exist yet.

processed_dir = "../data/processed"
os.makedirs(processed_dir, exist_ok=True)

for frame, file_name in (
    (trader_df, "trader_clean.csv"),
    (sentiment_df, "sentiment_clean.csv"),
):
    # Builds "../data/processed/<file_name>", the same targets as before.
    frame.to_csv(f"{processed_dir}/{file_name}", index=False)

print("✅ Cleaned CSVs saved in /data/processed/")

✅ Cleaned CSVs saved in /data/processed/
