In [1]:
# Importing libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Importing train dataset as df

df = pd.read_csv("infolimpioavanzadoTarget.csv")
df.shape

(3247, 1285)

In [3]:
df.head()

Unnamed: 0,date,open,high,low,close,adjclose,volume,ticker,RSIadjclose15,RSIvolume15,...,high-15,K-15,D-15,stochastic-k-15,stochastic-d-15,stochastic-kd-15,volumenrelativo,diff,INCREMENTO,TARGET
0,2022-01-03,17.799999,18.219,17.5,17.76,17.76,106600,ASLE,,,...,,,,,,,0.919758,-1.900001,-9.664295,0.0
1,2022-01-04,17.700001,18.309999,17.620001,17.66,17.66,128700,ASLE,,,...,,,,,,,1.11044,-1.379999,-7.247895,0.0
2,2022-01-05,17.58,17.799999,16.91,16.950001,16.950001,103100,ASLE,,,...,,,,,,,0.88956,-0.93,-5.201344,0.0
3,2022-01-06,16.65,16.879999,16.139999,16.17,16.17,173600,ASLE,,,...,,,,,,,1.497843,-0.36,-2.177856,0.0
4,2022-01-07,16.219999,16.290001,15.63,15.71,15.71,137800,ASLE,,,...,,,,,,,1.188956,-0.12,-0.758054,0.0


In [4]:
df.isna().sum()

date                  0
open                  0
high                  0
low                   0
close                 0
                   ... 
stochastic-kd-15    300
volumenrelativo     216
diff                 61
INCREMENTO           61
TARGET                1
Length: 1285, dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3247 entries, 0 to 3246
Columns: 1285 entries, date to TARGET
dtypes: float64(1281), int64(2), object(2)
memory usage: 31.8+ MB


In [6]:
df.select_dtypes(include=['object'])

Unnamed: 0,date,ticker
0,2022-01-03,ASLE
1,2022-01-04,ASLE
2,2022-01-05,ASLE
3,2022-01-06,ASLE
4,2022-01-07,ASLE
...,...,...
3242,2022-12-01,ASTE
3243,2022-12-02,ASTE
3244,2022-12-05,ASTE
3245,2022-12-06,ASTE


In [7]:
# Number of Infinity values

np.isinf(df.drop(['date', 'ticker'], axis=1)).values.sum()

57877

In [8]:
# Replacing infinity values to na values

df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [9]:
df['ticker'] = pd.Categorical(df['ticker'])
df['TARGET'] = pd.Categorical(df['TARGET'])

In [10]:
# Replacing na values in each numeric column with its mean

col_num = df.select_dtypes(include=['float'])

for column in col_num:
    df[column].fillna(np.mean(df[column]), inplace=True)

In [11]:
# Replacing na values in each categorical column with its mean

col_cat = df[['ticker', 'TARGET']]

for column in col_cat:
    df[column].fillna(df[column].mode(), inplace=True)

In [12]:
# Label Encoding categorical variables into integers

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df['ticker'] = le.fit_transform(df['ticker'])

In [13]:
df['TARGET'].value_counts()

0.0    2651
1.0     595
Name: TARGET, dtype: int64

In [14]:
# Creating new dataset with equal number of categories from dependent variable

df_sample = pd.concat([df[df['TARGET'] == 0].sample(1424), df[df['TARGET'] == 1].sample(1424)])

ValueError: ignored

In [None]:
df_sample['TARGET'].value_counts()

In [None]:
# Defining Independent and Dependent variables

X = df_sample.drop(['date'], axis=1)
y = df_sample['TARGET']

In [None]:
# Splitting into Train and Test sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 101)

In [None]:
# Applying Scaling on independent variables

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Appying Logistic Regression Model onto Train set

from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

In [None]:
y_pred = log_reg.predict(X_test)
y_pred

In [None]:
y_test.value_counts()

In [None]:
# Evaluating Model by Confusion Matrix and Accuracy Score

from sklearn.metrics import confusion_matrix, accuracy_score

cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

In [None]:
# Performing Cross Validation on Train sets

from sklearn.model_selection import cross_val_score

accuracies = cross_val_score(estimator= log_reg, X= X_train, y= y_train, cv= 10)
print('Mean Accuracy : {:.2f} %'.format(accuracies.mean()*100))
print('Standard Deviation : {:.2f} %'.format(accuracies.std()*100))
print(accuracies)