In [1]:
!pip install yfinance
!pip install pyspark
!pip install delta-spark


Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=c238664675db8f9105d19214e8d2696c637724876f60383272ad4efb37ae57b9
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.3
Collecting delta-spark
  Downloading delta_spark-3.2.1-py3-none-any.whl.metadata (1.9 kB)
Downloading delta_spark-3.2.1-py3-none-any.whl (21 kB)
Installing collected packages: delta-spark
Successfully installed delta-spark-3.2.1


In [2]:
import yfinance as yf
import pandas as pd
import numpy as np


**GETTING DATA OF REQUIRED COMPANY FROM THE YAHOO API**

DATA GATHERING STEP

In [3]:
import yfinance as yf
import pandas as pd
import os

def getData(ticker, filename='combined_stock_data.csv'):
    # Download stock data for the given ticker symbol
    stock_data = yf.download(ticker, start='2023-01-01', end='2023-09-27')

    # Rename columns to the desired format
    stock_data.rename(columns={
        'High': 'High',
        'Low': 'Low',
        'Open': 'Open',
        'Close': 'Close',
        'Adj Close': 'Adj Close',
        'Volume': 'Volume'
    }, inplace=True)

    # Reset index to get the 'Date' column as a separate column
    stock_data.reset_index(inplace=True)

    # Add a 'Ticker' column to identify which stock the data belongs to
    stock_data['ticker_symbol'] = ticker

    # Select only the relevant columns
    formatted_data = stock_data[['Date', 'ticker_symbol', 'High', 'Volume', 'Low', 'Open', 'Close', 'Adj Close']]

    # If the file already exists, append data; otherwise, create the file
    if os.path.exists(filename):
        # Append mode
        formatted_data.to_csv(filename, mode='a', header=False, index=False)
    else:
        # Write mode with header
        formatted_data.to_csv(filename, index=False)

    print(f"Data for {ticker} appended to {filename}")

# Example Usage: Append data for multiple tickers
getData('WIT')     # For Wipro's stock data
getData('INFY')   # For Google's stock data
getData('GOOGL')
getData('MSFT')   # For Microsoft's stock data
getData('ACN')
getData('CTSH')

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Data for WIT appended to combined_stock_data.csv
Data for INFY appended to combined_stock_data.csv


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Data for GOOGL appended to combined_stock_data.csv
Data for MSFT appended to combined_stock_data.csv


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Data for ACN appended to combined_stock_data.csv
Data for CTSH appended to combined_stock_data.csv





In [4]:
def preprocess(df2):
    ticker_list={
        "WIT":1,
        "INFY":2,
        "GOOGL":3,
        "MSFT":4,
        "ACN":5,
        "CTSH":6
    }
    # replace missing values with NULL
    df2 = df2.replace(r'/^s*$', np.nan, regex=True)

    # count number of NULLS in each column
    # print(df2.isna().sum(axis=0))

    # Now find rows which are having 3 or more than 3 nulls

    #print((df2.isna().sum(axis=1) >= 3).sum())
    k1 = df2[df2.isna().sum(axis=1)>= 3]
    k1 = k1.index.to_list()
    for x in k1:
        df2 = df2.drop(x)

    # checking whether the data is dropped
    #print((df2.isna().sum(axis=1) >= 3).sum())

    # Finding number of rows with NULL values
    #print(df2["Volume"].isna().sum())
    # Dropping the rows
    x = df2[df2["Volume"].isna()]
    l = x.index.to_list()
    for x in l:
        df2 = df2.drop(x)

       # Checking the volume column
    #print(df2["Volume"].isna().sum())

    # filling the null values with corresponding means
    df2["Low"] = df2["Low"].fillna(df2["Low"].mean())
    df2["High"] = df2["High"].fillna(df2["High"].mean())
    df2["Open"] = df2["Open"].fillna(df2["Open"].mean())
    df2["Close"] = df2["Close"].fillna(df2["Close"].mean())
    df2["Adj Close"] = df2["Adj Close"].fillna(df2["Adj Close"].mean())

    # Data Cleaning part is completed now we should transform the data into the required format
    ''' The table format we have in the table is
    Stock_id,company_id,Stock_prize,Trading_volume,RSI,MarketCap,Date
    '''
    df2["moving_avg"]=df2["Close"].rolling(window=14).mean()
    #df2.iloc[0:13,5]=df2.iloc[14,5]

    window_length=14
    relative_close_diff=df2["Close"].diff()

    gain=relative_close_diff.where(relative_close_diff>0,0)
    loss=-relative_close_diff.where(relative_close_diff<0,0)

    avg_gain=gain.rolling(window_length,min_periods=1).mean()
    avg_loss =loss.rolling(window_length, min_periods=1).mean()

    rs=avg_gain/avg_loss

    df2["RSI"]=100-(100/(1+rs))

    df=pd.DataFrame()
    df["Stock_id"]=[x for x in range(1,len(df2))]
    df['Company_id'] = df2['ticker_symbol'].map(ticker_list)
    df["Stock_Price"]=(df2["Open"]+df2["Close"]+df2["Low"]+df2["High"])/4
    df["Trading_Volume"]=df2["Volume"]
    df['Moving_Average']=df2['moving_avg']
    df["Market_Cap"]=df2["Adj Close"]
    df["RSI"]=df2["RSI"]
    df["Record_Time"]=df2["Date"]
    df.dropna(inplace=True)
    df.iloc[0:2,4]=df.iloc[2,4]

    # Checking Whether The Null Values are filled
    #print(df2.isna().sum())
    #print("**********************************************************************************")
    return df
df1=pd.read_csv("combined_stock_data.csv")
df1=preprocess(df1)
print(df1)
df1.to_csv("cleaned_data.csv",index=False)

      Stock_id  Company_id  Stock_Price  Trading_Volume  Moving_Average  \
13          14           1     4.992500         3438800        4.851429   
14          15           1     4.970000         4061100        4.851429   
15          16           1     4.897500         3277000        4.851429   
16          17           1     4.925000         4023400        4.876429   
17          18           1     4.895000         1837200        4.893571   
...        ...         ...          ...             ...             ...   
1098      1099           6    69.795000         5046500       70.950000   
1099      1100           6    70.137503         3550100       70.808571   
1100      1101           6    69.327499         2585000       70.637143   
1101      1102           6    69.469999         2048400       70.485714   
1102      1103           6    69.557501         3463800       70.397143   

      Market_Cap        RSI Record_Time  
13      4.967611  70.588206  2023-01-23  
14      4.96960