<a href="https://colab.research.google.com/github/willxpet/python_miniprojects/blob/main/logisticRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
import pandas as pd

# Upload dataset as "df"
df = pd.read_csv('/content/sample_data/logisticRegression_toyDataset.csv')

# Split comma-separated values into separate columns
df = df.rename(columns={df.columns[0]: "raw"})
df = df["raw"].str.split(",", expand=True)

# Name the newly created columns
df.columns = [
    "stock_id",
    "date",
    "next_month_return",
    "factor1",
    "factor2",
    "factor3",
    "factor4",
    "factor5",
    "win"
]

# Ensure data recognised in correct formats
df["date"] = pd.to_datetime(df["date"])
df["next_month_return"] = df["next_month_return"].astype(float)
df["factor1"] = df["factor1"].astype(float)
df["factor2"] = df["factor2"].astype(float)
df["factor3"] = df["factor3"].astype(float)
df["factor4"] = df["factor4"].astype(float)
df["factor5"] = df["factor5"].astype(float)
df["win"] = df["win"].astype(int)

# Explore dataframe pre-analysis
df.shape
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   stock_id           50 non-null     object        
 1   date               50 non-null     datetime64[ns]
 2   next_month_return  50 non-null     float64       
 3   factor1            50 non-null     float64       
 4   factor2            50 non-null     float64       
 5   factor3            50 non-null     float64       
 6   factor4            50 non-null     float64       
 7   factor5            50 non-null     float64       
 8   win                50 non-null     int64         
dtypes: datetime64[ns](1), float64(6), int64(1), object(1)
memory usage: 3.6+ KB


Unnamed: 0,stock_id,date,next_month_return,factor1,factor2,factor3,factor4,factor5,win
0,S001,2015-01-31,-0.0048,0.5,-0.14,0.65,1.52,-0.23,0
1,S001,2015-02-28,0.0321,1.01,-0.58,-0.53,-0.57,-0.92,0
2,S001,2015-03-31,0.1247,-1.01,0.31,-0.91,-1.41,1.47,0
3,S001,2015-04-30,-0.0198,0.39,0.12,-0.51,-0.6,0.94,1
4,S001,2015-05-31,0.0064,-0.01,-1.06,0.82,-1.22,0.2,0


In [39]:
# Define variables for logistic regression
## Target value (dependent variable)
y = df['win']

## Independent variables
feature_cols = ['factor1','factor2','factor3','factor4','factor5']
X = df[feature_cols]

# Sanity check target distribution
df['win'].value_counts()

Unnamed: 0_level_0,count
win,Unnamed: 1_level_1
0,26
1,24


In [48]:
# Split data set (70% = training set; 30% = test set)

## Split data by dates, reversing order so newest at top. This breaks
## grouping of data by stock which is OK because we're using
## cross-sectional data for patterns, not time-series paths.
df = df.sort_values('date').reset_index(drop=True)
df.head()

## Choose split point 70:30
split_index = int(len(df) * 0.7)

## Create split point
X_train = X.iloc[:split_index]
y_train = y.iloc[:split_index]

X_test = X.iloc[split_index:]
y_test = y.iloc[split_index:]

# Sanity check - size of splits
X_train.shape, X_test.shape
y_train.value_counts()

# Sanity check - date order
#df['date'].head()
#df['date'].tail()

Unnamed: 0_level_0,count
win,Unnamed: 1_level_1
0,19
1,16


In [50]:
# Run logistic regression
import statsmodels.api as sm

## Add intercept
X_train_const = sm.add_constant(X_train)
X_test_const = sm.add_constant(X_test)

#!! Why do we only use x constant in model fit??
#!! Which summary results matter and how to interpret?

## Fit model
logit_model = sm.Logit(y_train,X_train_const)
result = logit_model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.349934
         Iterations 8


0,1,2,3
Dep. Variable:,win,No. Observations:,35.0
Model:,Logit,Df Residuals:,29.0
Method:,MLE,Df Model:,5.0
Date:,"Thu, 04 Dec 2025",Pseudo R-squ.:,0.4925
Time:,12:49:14,Log-Likelihood:,-12.248
converged:,True,LL-Null:,-24.131
Covariance Type:,nonrobust,LLR p-value:,0.0002406

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.1788,0.674,-1.748,0.080,-2.501,0.143
factor1,3.7852,1.417,2.672,0.008,1.009,6.562
factor2,1.1621,1.036,1.122,0.262,-0.869,3.193
factor3,1.6315,0.989,1.650,0.099,-0.306,3.569
factor4,-0.0400,0.619,-0.065,0.948,-1.253,1.173
factor5,-0.2088,0.762,-0.274,0.784,-1.703,1.285


In [52]:
# Interpret coefficients as odds ratios
## Odds ratio = exp(coeff)
import numpy as np
np.exp(result.params)

### odds ratio > 1 --> increases likelihood of win
### odds ratio < 1 --> decreases likelihood
### values further from 1 have stronger influence. i.e. factor importance

Unnamed: 0,0
const,0.307659
factor1,44.045081
factor2,3.196674
factor3,5.111437
factor4,0.960773
factor5,0.811519


In [60]:
# Generate predictions
## Predicted probabilities as a float between 0 and 1
y_pred_prob = result.predict(X_test_const)
#!! what is this and how does it work? It's a probability?

## Class predictions (binary; 0 = lose, 1 = win)
y_pred_class = (y_pred_prob >=0.5).astype(int)
print(y_pred_class)

#!! I'm lost here - what is .5 for? are we predicting x or y?

35    1
36    0
37    1
38    0
39    1
40    0
41    1
42    0
43    0
44    0
45    1
46    0
47    1
48    0
49    1
dtype: int64


In [66]:
# Evaluate model performance
## Accuracy
print((y_pred_class == y_test).mean())
#!! what the fuck is this?


## Confusion matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,y_pred_class))
#!! what the fuck is this?


# ROC AUC
#!! what the fuck is this?
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pred_prob)

0.9333333333333333
[[7 0]
 [1 7]]


np.float64(0.9821428571428572)