In [45]:
import csv
import warnings
from collections import Counter
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Suppress every Python warning for the rest of the session
warnings.filterwarnings('ignore')

# Plot appearance: the 'seaborn-v0_8' matplotlib style sheet plus seaborn's "husl" palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# pandas display options, applied from one table so they are easy to scan and tune
PANDAS_DISPLAY_OPTIONS = {
    'display.max_columns': None,   # show all columns
    'display.max_colwidth': None,  # show full content of each column
    'display.width': None,         # don't wrap columns
    'display.max_rows': 20,        # show up to 20 rows
}
for option_name, option_value in PANDAS_DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)


In [60]:
# Load features.csv defensively. The file can contain rows written with
# different column counts, so the raw text is inspected first, then parsed with
# pandas; a cleaned copy is rebuilt only if pandas cannot parse the file at all.
FEATURES_PATH = 'features.csv'
CLEAN_FEATURES_PATH = 'features_clean.csv'
MAX_REPORTED_LINES = 20    # cap on how many mismatching line numbers are listed
BASIC_FEATURE_COUNT = 17   # leading columns whose names are taken from the file header
ENHANCED_COLUMNS = ['timestamp', 'spot_price', 'decision', 'confidence', 'ta_label', 'ta_conf',
                    'sentiment_composite', 'sentiment_recent', 'sentiment_slow', 'fused_score', 'news_sources']


def parse_csv_records(lines):
    """Parse raw CSV text lines into ``(line_number, fields)`` pairs.

    The ``csv`` module is used so that commas inside quoted fields are not
    treated as separators. Blank lines are dropped. ``line_number`` is 1-based
    and is the last physical line consumed for the record.
    """
    reader = csv.reader(lines)
    return [(reader.line_num, fields) for fields in reader if fields]


def build_clean_rows(rows, basic_count=BASIC_FEATURE_COUNT, enhanced_columns=ENHANCED_COLUMNS):
    """Return ``[header, *data_rows]`` keeping only full-width ("enhanced") rows.

    ``rows[0]`` is always treated as the file header and is never emitted as
    data. Its first ``basic_count`` names are kept (padded with ``feature_<i>``
    if the header is shorter) and ``enhanced_columns`` is appended. Every later
    row with at least that many fields is truncated to that width; shorter rows
    are dropped. Returns ``[]`` when no data row qualifies.
    """
    if not rows:
        return []
    width = basic_count + len(enhanced_columns)
    header = list(rows[0][:basic_count])
    header += [f'feature_{i}' for i in range(len(header), basic_count)]
    header += list(enhanced_columns)
    data_rows = [row[:width] for row in rows[1:] if len(row) >= width]
    return [header] + data_rows if data_rows else []


def describe_raw_file(lines, records):
    """Print a short preview of the raw text and a summary of fields per line."""
    print(f"Raw file has {len(lines)} lines")
    # zip() stops at the shorter input, so a 1- or 2-line file is previewed safely.
    for label, line in zip(['Header', 'Line 2', 'Line 3'], lines):
        print(f"{label}: {line.strip()}")
    if not records:
        return

    header_width = len(records[0][1])
    width_counts = Counter(len(fields) for _, fields in records)
    print("\nField counts per line:")
    for width, n_lines in sorted(width_counts.items()):
        print(f"  {width} fields: {n_lines} line(s)")

    odd_lines = [line_no for line_no, fields in records if len(fields) != header_width]
    if odd_lines:
        shown = ', '.join(str(line_no) for line_no in odd_lines[:MAX_REPORTED_LINES])
        hidden = len(odd_lines) - MAX_REPORTED_LINES
        suffix = f" (+{hidden} more)" if hidden > 0 else ""
        print(f"  Lines whose field count differs from the header ({header_width}): {shown}{suffix}")


def rebuild_from_enhanced_rows(rows, clean_path=CLEAN_FEATURES_PATH):
    """Write the full-width rows to ``clean_path`` and load that file.

    Returns an empty DataFrame when there is nothing to write or when the
    cleaned file still cannot be parsed.
    """
    clean_rows = build_clean_rows(rows)
    if not clean_rows:
        print("No enhanced data found in the file")
        return pd.DataFrame()

    with open(clean_path, 'w', encoding='utf-8', newline='') as f:
        csv.writer(f, lineterminator='\n').writerows(clean_rows)
    print(f"Created {clean_path} with {len(clean_rows) - 1} data rows")

    try:
        frame = pd.read_csv(clean_path)
    except Exception as e3:  # deliberate catch-all: this is the last best-effort step
        print(f"Still having issues with clean CSV: {e3}")
        return pd.DataFrame()
    print(f"Successfully loaded clean CSV: {frame.shape}")
    return frame


def load_features(path=FEATURES_PATH, rows=None, clean_path=CLEAN_FEATURES_PATH):
    """Read ``path`` with pandas, falling back step by step if parsing fails.

    Order of attempts: default engine, Python engine, then a cleaned copy
    rebuilt from ``rows`` (the already-parsed records of the same file).
    Always returns a DataFrame; it is empty if every attempt failed.
    """
    try:
        return pd.read_csv(path, on_bad_lines='skip')
    except Exception as e:  # deliberate catch-all: any parser failure triggers the fallbacks
        print(f"Error reading CSV: {e}")
        print("\nTrying alternative approach...")

    try:
        frame = pd.read_csv(path, sep=',', header=0, on_bad_lines='skip', engine='python')
    except Exception as e2:
        print(f"Alternative approach also failed: {e2}")
        print("Creating a clean CSV file...")
        return rebuild_from_enhanced_rows(rows or [], clean_path)
    print("Successfully loaded with Python engine")
    return frame


lines = []
# Start from an empty frame so a failed load never leaves a stale `df` from an
# earlier run of this cell in the kernel.
df = pd.DataFrame()
try:
    # utf-8-sig drops a leading BOM if present; newline='' is what csv expects.
    with open(FEATURES_PATH, 'r', encoding='utf-8-sig', newline='') as f:
        lines = f.readlines()
    records = parse_csv_records(lines)
except (OSError, UnicodeDecodeError, csv.Error) as e:
    print(f"Error reading CSV: {e}")
else:
    describe_raw_file(lines, records)
    df = load_features(FEATURES_PATH, [fields for _, fields in records])

    # Make silently skipped rows visible instead of just losing them.
    expected_rows = max(len(records) - 1, 0)
    if len(df) < expected_rows:
        print(f"\nWarning: {expected_rows - len(df)} of {expected_rows} data rows were not loaded")

    print(f"\nDataset shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print(f"\nFirst few rows:")

# Last expression of the cell, so the preview is actually rendered.
df.head()


Raw file has 15 lines
Header: rsi14,macd_hist,ema8_above_ema20,ema20_above_ema50,momentum_1h,momentum_2h,momentum_4h,volume_ratio,bb_upper,bb_lower,bb_position,atr_ratio,stoch_k,stoch_d,market_trend,market_volatility,close,timestamp,spot_price,decision,confidence,ta_label,ta_conf,sentiment_composite,sentiment_recent,sentiment_slow,fused_score,news_sources
Line 2: 25.996886509335354,-110.87579950849096,-382.6855069620215,-207.72628995843115,-0.0031227410652468,-0.0014135065992118,-0.0048860399840393,0.309907554717331,117639.77776464458,115620.17023535544,0.0485489201355294,0.003085253126085,15.400619905305955,23.78853358190776,downtrend,low,115718.22,2025-09-19 17:39:58.443801+00:00,115718.22,Down,0.5450969653632955,bearish,0.95,0.5570677489314837,0.3,0.3571428571428571,-0.2950969653632955,19
Line 3: 25.00465632645069,-117.5453835540734,-395.9566180731345,-213.5811919192201,-0.0040230629188949,-0.0023153721345209,-0.0057847693281414,0.5357561941489751,117655.28677006792,115594.210229932

In [61]:
# Render the loaded DataFrame as this cell's output.
df

Unnamed: 0,rsi14,macd_hist,ema8_above_ema20,ema20_above_ema50,momentum_1h,momentum_2h,momentum_4h,volume_ratio,bb_upper,bb_lower,bb_position,atr_ratio,stoch_k,stoch_d,market_trend,market_volatility,close,timestamp,spot_price,decision,confidence,ta_label,ta_conf,sentiment_composite,sentiment_recent,sentiment_slow,fused_score,news_sources
0,25.996887,-110.8758,-382.685507,-207.72629,-0.003122741,-0.001414,-0.004886,0.309908,117639.777765,115620.170235,0.048549,0.003085,15.40062,23.788534,downtrend,low,115718.22,2025-09-19 17:39:58.443801+00:00,115718.22,Down,0.545097,bearish,0.95,0.557068,0.3,0.357143,-0.295097,19
1,25.004656,-117.545384,-395.956618,-213.581192,-0.004023063,-0.002315,-0.005785,0.535756,117655.28677,115594.21023,0.009461,0.003147,9.024015,21.662999,downtrend,low,115613.71,2025-09-19 17:52:39.052444+00:00,115613.71,Down,0.75,bearish,0.95,0.724317,0.5,0.333333,-0.238232,19
2,24.642339,-120.114683,-401.068999,-215.836654,-0.004369891,-0.002663,-0.006131,0.575106,117661.711919,115583.759081,-0.004961,0.003172,6.567579,20.844187,downtrend,low,115573.45,2025-09-19 17:58:45.726867+00:00,115573.45,Down,0.75,bearish,0.95,0.48155,0.25,0.3,-0.320773,18
3,24.79569,-121.760812,-432.821543,-281.997217,9.520091e-07,-0.004613,-0.002497,0.004591,117681.66214,115415.62986,0.057188,0.002937,4.880672,14.364563,downtrend,low,115545.22,2025-09-19 18:01:02.659695+00:00,115545.22,Down,0.75,bearish,0.95,0.331439,0.1,0.388889,-0.371811,21
4,25.46768,-120.171753,-429.659638,-280.602259,0.0002164523,-0.004399,-0.002282,0.061813,117678.310003,115421.471997,0.065866,0.00296,6.411068,14.874695,downtrend,low,115570.12,2025-09-19 18:09:07.941262+00:00,115570.12,Abstain,0.25,bearish,0.95,0.0,0.0,0.0,0.0,19
5,26.23586,-118.319764,-425.974558,-278.976488,0.0004676096,-0.004149,-0.002031,0.076402,117674.518793,115428.165207,0.076112,0.00298,8.194686,15.469235,downtrend,low,115599.14,2025-09-19T18:11:35.909216+00:00,115599.14,Abstain,0.25,bearish,0.95,0.0,0.0,0.0,0.0,20
6,23.763215,-129.402978,-448.027892,-288.7059,-0.00103544,-0.005645,-0.00353,0.196646,117699.026686,115386.290314,0.016941,0.003056,0.0,12.737673,downtrend,low,115425.47,2025-09-19T18:13:50.571225+00:00,115425.47,Abstain,0.25,bearish,0.95,0.0,0.0,0.0,0.0,22
7,24.155486,-126.416949,-442.086304,-286.084612,-0.0006304897,-0.005242,-0.003126,0.288312,117692.000813,115397.995187,0.032373,0.003055,2.806216,13.673078,downtrend,low,115472.26,2025-09-19T18:15:12.999092+00:00,115472.26,Abstain,0.25,bearish,0.95,0.0,0.0,0.0,0.0,21
8,24.750571,-122.067775,-433.432336,-282.266684,-4.067675e-05,-0.004655,-0.002538,0.291194,117682.320136,115414.490864,0.055524,0.003053,6.893491,15.035503,downtrend,low,115540.41,2025-09-19T18:16:40.451019+00:00,115540.41,Abstain,0.25,bearish,0.95,0.0,0.0,0.0,0.0,17
9,23.709562,-129.819069,-448.855828,-289.071166,-0.001091868,-0.005701,-0.003587,0.291276,117700.029723,115384.635277,0.01482,0.003061,0.0,12.737673,downtrend,low,115418.95,2025-09-19T18:18:21.857770+00:00,115418.95,Abstain,0.25,bearish,0.95,0.0,0.0,0.0,0.0,19


In [43]:
# Inspect the timestamp column on its own and next to the key signal fields
if 'timestamp' not in df.columns:
    print("No timestamp column found in the dataframe")
else:
    ts_column = df['timestamp']
    print("Timestamp column details:")
    print(f"Data type: {ts_column.dtype}")
    print(f"Number of unique timestamps: {ts_column.nunique()}")

    # One line per row, numbered by position rather than by index label
    print(f"\nAll timestamps:")
    for position, value in enumerate(ts_column):
        print(f"  Row {position}: {value}")

    print(f"\nTimestamp column in context:")
    context_columns = ['timestamp', 'decision', 'confidence', 'sentiment_composite']
    print(df[context_columns].to_string())


Timestamp column details:
Data type: object
Number of unique timestamps: 8

All timestamps:
  Row 0: 2025-09-19T14:05:32.858358+00:00
  Row 1: 2025-09-19T14:13:14.070189+00:00
  Row 2: 2025-09-19T14:14:38.474026+00:00
  Row 3: 2025-09-19T14:15:51.369052+00:00
  Row 4: 2025-09-19T14:17:01.894116+00:00
  Row 5: 2025-09-19T14:17:01.900780+00:00
  Row 6: 2025-09-19T14:18:24.737486+00:00
  Row 7: 2025-09-19T14:18:24.744455+00:00

Timestamp column in context:
                          timestamp decision  confidence  sentiment_composite
0  2025-09-19T14:05:32.858358+00:00  Abstain        0.25             0.000000
1  2025-09-19T14:13:14.070189+00:00  Abstain        0.25             0.000000
2  2025-09-19T14:14:38.474026+00:00  Abstain        0.25             0.000000
3  2025-09-19T14:15:51.369052+00:00  Abstain        0.25             0.000000
4  2025-09-19T14:17:01.894116+00:00  Abstain        0.25             0.000000
5  2025-09-19T14:17:01.900780+00:00     Down        0.75             0.604

In [35]:
# Column dtypes first, then summary statistics as the cell's rendered output
section_rule = "=" * 50
print("Data Types:", df.dtypes, sep="\n")
print("\n" + section_rule)
print("Basic Statistics:")
df.describe()


Data Types:
rsi14                  float64
macd_hist              float64
ema8_above_ema20       float64
ema20_above_ema50      float64
momentum_1h            float64
momentum_2h            float64
momentum_4h            float64
volume_ratio           float64
bb_upper               float64
bb_lower               float64
bb_position            float64
atr_ratio              float64
stoch_k                float64
stoch_d                float64
market_trend            object
market_volatility       object
close                  float64
timestamp               object
spot_price             float64
decision                object
confidence             float64
ta_label                object
ta_conf                float64
sentiment_composite    float64
sentiment_recent       float64
sentiment_slow         float64
fused_score            float64
news_sources             int64
dtype: object

Basic Statistics:


Unnamed: 0,rsi14,macd_hist,ema8_above_ema20,ema20_above_ema50,momentum_1h,momentum_2h,momentum_4h,volume_ratio,bb_upper,bb_lower,...,stoch_d,close,spot_price,confidence,ta_conf,sentiment_composite,sentiment_recent,sentiment_slow,fused_score,news_sources
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,...,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,41.86569,-82.174029,-96.620723,191.321572,0.001026,-0.000495,-0.001241,0.70773,117835.672603,116383.050797,...,26.170726,116747.69,116747.69,0.426302,0.726801,0.167377,0.083333,0.158929,0.061657,19.5
std,11.714294,41.116378,226.058196,212.900312,0.000942,0.001577,0.000917,0.425005,42.564713,374.915628,...,23.183418,503.267519,503.267519,0.229162,0.067546,0.298406,0.225668,0.219586,0.376324,0.707107
min,31.563571,-111.559329,-273.37566,23.289228,-3.4e-05,-0.003041,-0.002789,0.117829,117783.359537,116080.606752,...,9.712131,116298.59,116298.59,0.25,0.634308,0.0,-0.166667,0.0,-0.492893,19.0
25%,33.535706,-107.161494,-264.624867,27.149872,0.000128,-0.000888,-0.001478,0.359769,117821.741286,116094.463799,...,11.365018,116367.5025,116367.5025,0.25,0.676038,0.0,0.0,0.0,0.0,19.0
50%,35.197305,-103.208468,-256.759152,30.620041,0.001087,-0.00024,-0.000849,0.836868,117828.106029,116105.91858,...,12.850727,116429.445,116429.445,0.25,0.734725,0.0,0.0,0.0,0.0,19.0
75%,47.957213,-82.617106,60.392678,407.796446,0.001793,0.000417,-0.000727,1.110235,117835.682248,116762.26872,...,28.79125,117169.27,117169.27,0.675744,0.747904,0.183689,0.0,0.334821,0.0,20.0
max,60.892844,-6.743189,250.428785,467.384064,0.002576,0.001458,8e-06,1.120113,117907.75928,116870.728463,...,67.976862,117467.07,117467.07,0.742893,0.821488,0.734072,0.5,0.5,0.742371,21.0


In [23]:
# Convert timestamp to timezone-aware datetimes and summarise the signal columns
if 'timestamp' in df.columns:
    print("Converting timestamp to datetime...")
    # features.csv contains both "YYYY-MM-DD HH:MM:SS.ffffff+00:00" and
    # "YYYY-MM-DDTHH:MM:SS.ffffff+00:00" strings. Letting pandas infer a single
    # format from the first value can reject the other variant, so parse every
    # value as ISO 8601 and normalise to UTC.
    try:
        df['timestamp'] = pd.to_datetime(df['timestamp'], format='ISO8601', utc=True)
    except ValueError:
        # format='ISO8601' was added in pandas 2.0; older versions treat it as a
        # literal strptime pattern and fail, so fall back to plain inference.
        df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True)
    print(f"✓ Timestamp converted successfully")
    print(f"Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
    print(f"Number of predictions: {len(df)}")

    # Show the signal information
    print(f"\nSignal Analysis:")
    print(f"Decisions: {df['decision'].value_counts().to_dict()}")
    print(f"Confidence range: {df['confidence'].min():.3f} to {df['confidence'].max():.3f}")
    print(f"TA Labels: {df['ta_label'].value_counts().to_dict()}")
    print(f"News sources: {df['news_sources'].unique()}")
else:
    print("No timestamp column found - this appears to be the old format")
    print("Run the script with --schedule to generate enhanced data")


Converting timestamp to datetime...
✓ Timestamp converted successfully
Date range: 2025-09-18 20:23:16.661054+00:00 to 2025-09-18 20:23:16.669677+00:00
Number of predictions: 3

Signal Analysis:
Decisions: {'Up': 2, 'Abstain': 1}
Confidence range: 0.250 to 0.800
TA Labels: {'bullish': 3}
News sources: [20]


In [24]:
# Print every stored prediction, then the technical snapshot of the first row
def first_value(column):
    """Return the value of `column` in the first row of the notebook-level df."""
    return df[column].iloc[0]


print("Current data with timestamps:")
print(f"Shape: {df.shape}")

print(f"\nTimestamps:")
for position, stamp in enumerate(df['timestamp']):
    print(f"  Row {position}: {stamp}")

# Rows are labelled with their index label here, walking the needed columns in parallel
print(f"\nDecisions and Confidence:")
for label, decision, conf, stamp in zip(df.index, df['decision'], df['confidence'], df['timestamp']):
    print(f"  Row {label}: {decision} (conf: {conf:.3f}) at {stamp}")

print(f"\nTechnical Analysis:")
print(f"  RSI: {first_value('rsi14'):.2f}")
print(f"  MACD: {first_value('macd_hist'):.2f}")
print(f"  Market Trend: {first_value('market_trend')}")
print(f"  Market Volatility: {first_value('market_volatility')}")


Current data with timestamps:
Shape: (3, 28)

Timestamps:
  Row 0: 2025-09-18 20:23:16.661054+00:00
  Row 1: 2025-09-18 20:23:16.666167+00:00
  Row 2: 2025-09-18 20:23:16.669677+00:00

Decisions and Confidence:
  Row 0: Abstain (conf: 0.250) at 2025-09-18 20:23:16.661054+00:00
  Row 1: Up (conf: 0.699) at 2025-09-18 20:23:16.666167+00:00
  Row 2: Up (conf: 0.800) at 2025-09-18 20:23:16.669677+00:00

Technical Analysis:
  RSI: 60.89
  MACD: -6.74
  Market Trend: uptrend
  Market Volatility: low


In [25]:
# Convert timestamp to timezone-aware datetimes if the column exists
if 'timestamp' in df.columns:
    # The file mixes space-separated and 'T'-separated ISO 8601 strings, which
    # single-format inference can reject; parse as ISO 8601 and normalise to UTC.
    try:
        df['timestamp'] = pd.to_datetime(df['timestamp'], format='ISO8601', utc=True)
    except ValueError:
        # format='ISO8601' was added in pandas 2.0; older versions treat it as a
        # literal strptime pattern and fail, so fall back to plain inference.
        df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True)
    print(f"Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
    print(f"Number of predictions: {len(df)}")
else:
    print("No timestamp column found - this appears to be the old format")
    print("Run the script with --schedule to generate enhanced data")


Date range: 2025-09-18 20:23:16.661054+00:00 to 2025-09-18 20:23:16.669677+00:00
Number of predictions: 3


In [27]:
# Render the current DataFrame as this cell's output.
df

Unnamed: 0,rsi14,macd_hist,ema8_above_ema20,ema20_above_ema50,momentum_1h,momentum_2h,momentum_4h,volume_ratio,bb_upper,bb_lower,...,spot_price,decision,confidence,ta_label,ta_conf,sentiment_composite,sentiment_recent,sentiment_slow,fused_score,news_sources
0,60.892844,-6.743189,250.428785,467.384064,-3.4e-05,-0.003041,-0.000755,0.117829,117907.75928,116762.26872,...,117467.07,Abstain,0.25,bullish,0.747904,0.0,0.0,0.0,0.0,20
1,60.892844,-6.743189,250.428785,467.384064,-3.4e-05,-0.003041,-0.000755,0.117829,117907.75928,116762.26872,...,117467.07,Up,0.698742,bullish,0.747904,0.734072,0.5,0.375,0.742371,20
2,60.892844,-6.743189,250.428785,467.384064,-3.4e-05,-0.003041,-0.000755,0.117829,117907.75928,116762.26872,...,117467.07,Up,0.8,bullish,0.747904,0.734072,0.5,0.375,0.742371,20


In [None]:
# Check for signal information: value range for numeric columns, distinct values otherwise
signal_columns = ['decision', 'confidence', 'ta_label', 'ta_conf', 'sentiment_composite', 'fused_score']
available_signal_cols = [col for col in signal_columns if col in df.columns]

if available_signal_cols:
    print("Signal information available:")
    for col in available_signal_cols:
        values = df[col]
        # Branch on "is numeric" rather than "is not object": string, categorical,
        # datetime and boolean columns cannot be formatted with ':.3f'.
        is_numeric = (pd.api.types.is_numeric_dtype(values)
                      and not pd.api.types.is_bool_dtype(values))
        if is_numeric:
            summary = f'{values.min():.3f} to {values.max():.3f}'
        else:
            summary = values.unique()
        print(f"  {col}: {summary}")
else:
    print("No signal information found - this appears to be basic technical features only")


In [None]:
# Shape and column names as text, followed by the first rows as the rendered output
overview_lines = (
    "Current dataset:",
    f"Shape: {df.shape}",
    f"\nColumns: {list(df.columns)}",
    f"\nSample data:",
)
print(*overview_lines, sep="\n")
df.head()


In [None]:
# Technical indicators to plot, in panel order
tech_features = [
    'rsi14', 'macd_hist', 'momentum_1h', 'momentum_2h', 'momentum_4h',
    'volume_ratio', 'bb_position', 'atr_ratio', 'stoch_k', 'stoch_d',
]

# One histogram per indicator on a 2x5 grid, flattened so panels pair up with features
fig, axes = plt.subplots(2, 5, figsize=(20, 10))
axes = axes.ravel()

for panel, feature in zip(axes, tech_features):
    if feature not in df.columns:
        # Panel is left blank when the column is absent
        continue
    panel.hist(df[feature], bins=20, alpha=0.7)
    panel.set_title(f'{feature} Distribution')
    panel.set_xlabel(feature)
    panel.set_ylabel('Frequency')

plt.tight_layout()
plt.show()


In [None]:
# Analyze signals if available: decision counts, then confidence plots side by side
if 'decision' in df.columns:
    print("Signal Distribution:")
    print(df['decision'].value_counts())

    if 'confidence' in df.columns:
        # Explicit Axes keep both plots in one figure. DataFrame.boxplot(by=...)
        # opens a new figure of its own unless it is handed an Axes, which left
        # the right-hand panel empty.
        fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 4))

        df['confidence'].hist(bins=20, alpha=0.7, ax=hist_ax)
        hist_ax.set_title('Confidence Distribution')
        hist_ax.set_xlabel('Confidence')
        hist_ax.set_ylabel('Frequency')

        df.boxplot(column='confidence', by='decision', ax=box_ax)
        box_ax.set_title('Confidence by Decision')
        # Clear the figure-level title that the grouped boxplot adds.
        fig.suptitle('')

        fig.tight_layout()
        plt.show()


In [None]:
# Pairwise correlations between all numeric columns, drawn as an annotated heatmap
numeric_cols = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df.loc[:, numeric_cols].corr()

heatmap_options = dict(annot=True, cmap='coolwarm', center=0, square=True, fmt='.2f')

plt.figure(figsize=(15, 12))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()


In [None]:
# Time series panels: price on top and, when present, prediction confidence below
def plot_over_time(panel, column, title, ylabel, **line_style):
    """Plot df[column] against df['timestamp'] in row `panel` of a 2x1 subplot grid."""
    plt.subplot(2, 1, panel)
    plt.plot(df['timestamp'], df[column], marker='o', markersize=4, **line_style)
    plt.title(title)
    plt.xlabel('Time')
    plt.ylabel(ylabel)
    plt.xticks(rotation=45)


if 'timestamp' in df.columns and 'close' in df.columns:
    plt.figure(figsize=(15, 8))

    plot_over_time(1, 'close', 'BTC Price Over Time', 'Price ($)')

    if 'confidence' in df.columns:
        plot_over_time(2, 'confidence', 'Prediction Confidence Over Time', 'Confidence', color='red')

    plt.tight_layout()
    plt.show()
