In [7]:
import requests
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline

In [3]:
# Your Polygon API key
API_KEY = 'xDoEKdH8gCRADKFy5hDGAq36frjsqge_'

def fetch_data(symbol, start_date, end_date):
    url = f'https://api.polygon.io/v2/aggs/ticker/{symbol}/range/1/minute/{start_date}/{end_date}?apiKey={API_KEY}'
    response = requests.get(url)
    if response.status_code == 200:
        return pd.DataFrame(response.json()['results'])
    else:
        print(f"Failed to fetch data for {symbol} on {start_date} to {end_date}")
        return pd.DataFrame()

# Symbols you're interested in (including an S&P 500 ETF for approximation)
symbols = ['NVDA', 'GOOGL', 'AAPL', 'SPY'] # SPY is an ETF that tracks the S&P 500
start_date = '2023-01-03'
end_date = '2023-01-10'

# Fetch and store data for each symbol
data_frames = {}
for symbol in symbols:
    df = fetch_data(symbol, start_date, end_date)
    if not df.empty:
        # Convert timestamp to readable date and time format
        df['timestamp'] = pd.to_datetime(df['t'], unit='ms')
        df.set_index('timestamp', inplace=True)
        # Calculate minute-by-minute returns
        df['returns'] = df['c'].pct_change() * 100 # 'c' is the closing price
        data_frames[symbol] = df

# Example: Check data for NVDA
if 'NVDA' in data_frames:
    print(data_frames['NVDA'].head())


                        v        vw       o       c       h       l  \
timestamp                                                             
2023-01-03 09:00:00  2711  149.3257  150.70  148.95  151.00  148.17   
2023-01-03 09:02:00   388  148.8909  148.94  148.94  148.94  148.94   
2023-01-03 09:03:00   609  148.8008  148.85  148.85  148.85  148.85   
2023-01-03 09:04:00  1155  148.8968  148.81  148.95  148.96  148.81   
2023-01-03 09:05:00   409  148.9411  148.95  148.95  148.95  148.95   

                                 t   n   returns  
timestamp                                         
2023-01-03 09:00:00  1672736400000  87       NaN  
2023-01-03 09:02:00  1672736520000  11 -0.006714  
2023-01-03 09:03:00  1672736580000  26 -0.060427  
2023-01-03 09:04:00  1672736640000  28  0.067182  
2023-01-03 09:05:00  1672736700000  13  0.000000  


In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5000 entries, 2023-01-03 09:00:00 to 2023-01-10 23:31:00
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   v        5000 non-null   float64
 1   vw       5000 non-null   float64
 2   o        5000 non-null   float64
 3   c        5000 non-null   float64
 4   h        5000 non-null   float64
 5   l        5000 non-null   float64
 6   t        5000 non-null   int64  
 7   n        5000 non-null   int64  
 8   returns  4999 non-null   float64
dtypes: float64(7), int64(2)
memory usage: 390.6 KB
None


In [6]:
# Define numerical columns (assuming all columns except 'returns' are features)
numerical_cols = ['v', 'vw', 'o', 'c', 'h', 'l', 't', 'n']  # Update if necessary

# Create a preprocessing pipeline for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())  # It's good practice to scale features for models like SVM or logistic regression
])

# Combine transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
    ])


In [8]:
X = df[numerical_cols]  # Features
y = df['returns'].apply(lambda x: 1 if x > 0 else 0)  # Target variable (1 if stock went up, 0 otherwise)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model pipeline
model_pipeline = make_pipeline(preprocessor, DecisionTreeClassifier(random_state=42))

# Define the grid search parameters
param_grid = {
    'decisiontreeclassifier__max_depth': [None, 10, 20, 30],
    'decisiontreeclassifier__min_samples_split': [2, 10, 20]
}

# Setup grid search
grid_search = GridSearchCV(model_pipeline, param_grid, cv=5, scoring='accuracy')

# Fit model
grid_search.fit(X_train, y_train)

# Best model evaluation
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

y_pred = grid_search.predict(X_test)
print(classification_report(y_test, y_pred))

Best parameters: {'decisiontreeclassifier__max_depth': None, 'decisiontreeclassifier__min_samples_split': 10}
Best score: 0.666
              precision    recall  f1-score   support

           0       0.71      0.71      0.71       532
           1       0.67      0.67      0.67       468

    accuracy                           0.69      1000
   macro avg       0.69      0.69      0.69      1000
weighted avg       0.69      0.69      0.69      1000

