In [19]:
import pandas as pd
import numpy as np
from pathlib import Path
import sqlite3

In [21]:
# 1. Paths
csv_path = Path("data") / "raw" / "data.csv"
out_path = Path("data") / "processed" / "data_processed.csv"

print("Reading:", csv_path.resolve())
if not csv_path.exists():
    raise FileNotFoundError(f"CSV not found at {csv_path.resolve()}")

# 2. Read raw file with NO header; we'll build header/data manually
raw = pd.read_csv(csv_path, header=None)
print("Raw shape:", raw.shape)

# Row 0: header names like Price,Close,High,Low,Open,Volume
# Row 1: ticker row -> drop
# Row 2+: actual data
header = raw.iloc[0].tolist()
data   = raw.iloc[2:].reset_index(drop=True)
data.columns = header

# 3. Rename first column (Price) to Date and parse
if "Price" not in data.columns:
    raise KeyError(f"'Price' column not found; got {data.columns.tolist()}")

data = data.rename(columns={"Price": "Date"})
data["Date"] = pd.to_datetime(data["Date"], errors="coerce")
data = data.dropna(subset=["Date"]).sort_values("Date").reset_index(drop=True)

# 4. Ensure numeric OHLCV
num_cols = ["Close", "High", "Low", "Open", "Volume"]
for c in num_cols:
    if c not in data.columns:
        raise KeyError(f"Column '{c}' not found after cleaning; columns = {data.columns.tolist()}")
    data[c] = pd.to_numeric(data[c], errors="coerce")

data = data.dropna(subset=num_cols).reset_index(drop=True)

# 5. RSI helper
def compute_rsi(series: pd.Series, window: int = 14) -> pd.Series:
    delta = series.diff()
    gain  = np.where(delta > 0, delta, 0.0)
    loss  = np.where(delta < 0, -delta, 0.0)
    gain_ewm = pd.Series(gain, index=series.index).ewm(span=window, adjust=False).mean()
    loss_ewm = pd.Series(loss, index=series.index).ewm(span=window, adjust=False).mean()
    rs  = gain_ewm / loss_ewm
    return 100 - (100 / (1 + rs))

# 6. Technical indicators
data["SMA50"]  = data["Close"].rolling(window=50,  min_periods=50).mean()
data["SMA200"] = data["Close"].rolling(window=200, min_periods=200).mean()
data["EMA50"]  = data["Close"].ewm(span=50,  adjust=False, min_periods=50).mean()
data["EMA200"] = data["Close"].ewm(span=200, adjust=False, min_periods=200).mean()
data["RSI14"]  = compute_rsi(data["Close"], window=14)

# 7. Final selection and drop NaNs from indicators
keep_cols = ["Date", "Close", "Open", "High", "Low", "Volume",
             "SMA50", "SMA200", "EMA50", "EMA200", "RSI14"]
data = data[keep_cols].dropna().reset_index(drop=True)

print("Processed rows:", len(data))
print("Columns:", data.columns.tolist())
print(data.head())


# 8. Save to SQLite
sqlite_path = Path("data") / "processed" / "data_processed.sqlite"
sqlite_path.parent.mkdir(parents=True, exist_ok=True)

with sqlite3.connect(sqlite_path) as conn:
    # table name 'data'; replace if it already exists
    data.to_sql("data", conn, if_exists="replace", index=False)

print("Saved processed SQLite DB to:", sqlite_path.resolve())

Reading: C:\Users\soltv\Documents\GitHub\newalgotrade\data\raw\data.csv
Raw shape: (6525, 6)
Processed rows: 6323
Columns: ['Date', 'Close', 'Open', 'High', 'Low', 'Volume', 'SMA50', 'SMA200', 'EMA50', 'EMA200', 'RSI14']
        Date     Close      Open      High       Low      Volume     SMA50  \
0 2000-10-16  0.322404  0.334588  0.348646  0.320529   820176000  0.702868   
1 2000-10-17  0.301785  0.325216  0.328964  0.295225   601720000  0.694527   
2 2000-10-18  0.301785  0.291475  0.315843  0.281166   834265600  0.686542   
3 2000-10-19  0.283978  0.287258  0.297099  0.274606  1506724800  0.677976   
4 2000-10-20  0.292412  0.285852  0.305533  0.283978   791263200  0.669559   

     SMA200     EMA50    EMA200      RSI14  
0  0.794551  0.633340  0.764040  26.893359  
1  0.791863  0.620338  0.759440  24.368773  
2  0.789529  0.607846  0.754886  24.368773  
3  0.787050  0.595145  0.750201  21.994769  
4  0.784951  0.583273  0.745646  25.938032  
Saved processed SQLite DB to: C:\Users\s

  data["Date"] = pd.to_datetime(data["Date"], errors="coerce")
