# Review Data


In [1]:
import yfinance as yf
import pandas as pd
import sys
!{sys.executable} -m pip install pandas-ta
import pandas_ta as ta



In [20]:
df = yf.download(tickers = 'AAPL', start = '2023-01-01', end = '2023-12-31', interval = '1d')
df.head(5)

[*********************100%***********************]  1 of 1 completed


Price,Close,High,Low,Open,Volume
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2023-01-03,123.211205,128.954553,122.324579,128.343772,112117500
2023-01-04,124.482033,126.747853,123.221057,125.004155,89113600
2023-01-05,123.161942,125.871071,122.905811,125.240583,80962700
2023-01-06,127.693588,128.353629,123.03389,124.137247,87754700
2023-01-09,128.215683,131.427242,127.959553,128.530934,70790800


In [21]:
df.describe()

Price,Close,High,Low,Open,Volume
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL
count,250.0,250.0,250.0,250.0,250.0
mean,170.510818,171.803594,168.962667,170.222273,59223550.0
std,17.340363,17.27221,17.538701,17.53631,17783320.0
min,123.161942,125.871071,122.324579,124.137247,24048300.0
25%,159.947018,160.218348,158.553386,159.39697,47812080.0
50%,173.604637,175.138089,171.987046,173.377061,55110850.0
75%,185.426708,186.05988,182.954159,184.564629,65742920.0
max,196.256577,197.752444,195.156959,196.167422,154357300.0


In [16]:
tickers = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'TSLA']
data_list = []
# Loop through each ticker and download data from each one 
for ticker in tickers: 
    df = yf.download(tickers = ticker, start = '2023-01-01', end = '2023-12-31', interval = '1d')
    # Check if a column (df.columns) is multi-index (pd.MultiIndex) or not
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.get_level_values(0)
    
    # Initialize a new column 'RSI'
    """RSI stands for Relative Strength Index, it measures the velocity and
    magnitude of directional price movements to make predictions based on 
    ratio of upward movements to downward movements over a specified time period (ussually 14 days),
    it ranges from 0 to 100.
    
    If RSI > 70: Overbought condition (potentially overvalued, may see a price pullback)
    If RSI < 30: Oversold condition (potentially undervalued, may see a price bounce)
    """
    # Calculate RSI in 14 days period
    df['RSI'] = ta.rsi(df['Close'], length=14)    
    # Initialize a new column 'SMA_50'
    """
    SMA stands for Simple Moving Average, it is the average closing price over a specific period (in this case, 50 days).
    
    
    """

    df['SMA_50'] = ta.sma(df['Close'], length=50)
    
    # Create a target variable based on future price movement: value = 1 if price increases compared to yesterday, else 0
    df['Target'] = (df['Close'].shift(-1) > df['Close']).astype(int)
    # Add a column for ticker symbol
    df['Ticker'] = ticker
    # Drop rows with NaN values
    df.dropna(inplace=True)
    # Append the processed DataFrame to the list
    data_list.append(df)
    
    final_df = pd.concat(data_list, axis=0)
    print(final_df.head(), '\n')
    print(f"Row totals: {len(final_df)}", '\n')
    

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Price            Close        High         Low        Open    Volume  \
Date                                                                   
2023-03-15  150.946396  151.202917  147.917397  149.170437  77167900   
2023-03-16  153.768158  154.370010  149.614389  150.127447  76161100   
2023-03-17  152.929504  154.646267  152.219121  153.995080  98944600   
2023-03-20  155.297440  155.711842  152.090853  152.998577  73641400   
2023-03-21  157.152344  157.270736  154.448939  155.218534  73938300   

Price             RSI      SMA_50  Target Ticker  
Date                                              
2023-03-15  59.145568  142.745689       1   AAPL  
2023-03-16  63.449901  143.356828       0   AAPL  
2023-03-17  61.380042  143.925777       1   AAPL  
2023-03-20  64.865176  144.568487       1   AAPL  
2023-03-21  67.350700  145.157662       0   AAPL   

Row totals: 201 

Price            Close        High         Low        Open    Volume  \
Date                                          




In [17]:
final_df['Ticker'].value_counts()

Ticker
AAPL     201
MSFT     201
GOOGL    201
AMZN     201
TSLA     201
Name: count, dtype: int64

In [21]:
final_df.shape


(1005, 9)

In [20]:
final_df.head()


Price,Close,High,Low,Open,Volume,RSI,SMA_50,Target,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-03-15,150.946396,151.202917,147.917397,149.170437,77167900,59.145568,142.745689,1,AAPL
2023-03-16,153.768158,154.37001,149.614389,150.127447,76161100,63.449901,143.356828,0,AAPL
2023-03-17,152.929504,154.646267,152.219121,153.99508,98944600,61.380042,143.925777,1,AAPL
2023-03-20,155.29744,155.711842,152.090853,152.998577,73641400,64.865176,144.568487,1,AAPL
2023-03-21,157.152344,157.270736,154.448939,155.218534,73938300,67.3507,145.157662,0,AAPL


In [23]:
final_df['Target'].value_counts(normalize=True)

Target
1    0.543284
0    0.456716
Name: proportion, dtype: float64

In [None]:
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler

# Features and target variables
X = final_df[['RSI', 'SMA_50', 'Close', 'Volume']]
y = final_df['Target']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state = 42, stratify = y)

# Standardize the feature variables
scaler = StandardScaler()
# Standardize the training data by calculating the z-scores
X_train_scaled = scaler.fit_transform(X_train)
# Standardize the testing data using the same parameters as training data
X_test_scaled = scaler.transform(X_test)