# Dependency

In [4]:
import pandas as pd
import numpy as np
from pandas_datareader import data as wb # datareader supports multiple financial database including yahoo and google
import datetime
from dateutil.relativedelta import relativedelta
import GetOldTweets3 as tws
import tweepy

# Data Source
## Start & End Date

In [5]:
# Length of the look-back window, in months.
date_rang_month = 3
# Read the current date once so both ends of the window are derived from the
# same day, even if this cell happens to run across midnight.
_today = datetime.date.today()
start_date = _today + relativedelta(months=-date_rang_month)
end_date = _today

## Stock Data

In [46]:
# (symbol, data source) pairs. The number in each trailing comment is the
# position the downloaded frame ends up at in `stock_dfs`.
tickers = [
    ("TSLA", "yahoo"),    # 0, TESLA Stock
    ("DJIA", "fred"),     # 1, Dow Jones Index Average
    ("DFF", "fred"),      # 2, Federal Funds Rate
    ("PCRFY", "yahoo"),   # 3, Panasonic Corp. Stock
    ("BMW.DE", "yahoo"),  # 4, BMW Stock
    ("DAI.DE", "yahoo"),  # 5, Daimler AG Stock
    ("XPEV", "yahoo"),    # 6, XPeng Inc. Stock
    ("BYDDF", "yahoo"),   # 7, BYD Company Limited Stock
    ("BZ=F", "yahoo"),    # 8, Brent Crude Oil Last Day Financ
]

# Download one DataFrame per entry, in the same order as `tickers`.
stock_dfs = []
for ticker, source in tickers:
    # Series served by "fred" are requested from one day before start_date;
    # every other source is requested from start_date itself.
    # NOTE(review): the reason for the extra day is not stated here - confirm.
    fetch_start = (
        (start_date + relativedelta(days=-1)) if source == "fred" else start_date
    )
    df = pd.DataFrame(
        wb.DataReader(ticker, data_source=source, start=fetch_start, end=end_date)
    )
    stock_dfs.append(df)

## Number of Tesla vehicles delivered worldwide 

In [44]:
# (quarter, year) pairs identifying the press releases to scrape, oldest first.
quaters = [
    (r'q2', r'2019'),
    (r'q3', r'2019'),
    (r'q4', r'2019'),
    (r'q1', r'2020'),
    (r'q2', r'2020'),
    (r'q3', r'2020'),
]

# Column labels for the six values collected per quarter: table rows 1-3,
# two cells per row, read row by row.
delivery_columns = [
    "Production Model S/X",
    "Deliveries Model S/X",
    "Production Model 3",
    "Deliveries Model 3",
    "Production Total",
    "Deliveries Total",
]

quarter_rows = []
for q, year in quaters:
    url = r'https://ir.tesla.com/press-release/tesla-' + q + r'-' + year + r'-vehicle-production-deliveries'
    tables = pd.read_html(url)
    # For the Q2 2019 release the values are read one column further to the
    # right than for every other quarter.
    # NOTE(review): presumably that page's table has an extra leading column - verify.
    col_shift = 1 if (q == r'q2' and year == r'2019') else 0
    quarter_rows.append(
        [
            tables[0].iloc[row, col + col_shift]
            for row in range(1, 4)
            for col in range(1, 3)
        ]
    )

# One row per quarter, in the same order as `quaters`.
deliver_df = pd.DataFrame(quarter_rows, columns=delivery_columns)
deliver_df
    

Unnamed: 0,Production Model S/X,Deliveries Model S/X,Production Model 3,Deliveries Model 3,Production Total,Deliveries Total
0,14517,17650,72531,77550,87048,95200
1,16318,17400,79837,79600,96155,97000
2,17933,19450,86958,92550,104891,112000
3,15390,12200,87282,76200,102672,88400
4,6326,10600,75946,80050,82272,90650
5,16992,15200,128044,124100,145036,139300


## Tesla Revenue

In [45]:
# Scrape every HTML table on the page and keep the one at index 1, which is
# the quarterly revenue table (see the column headers in the output below).
revenue_df = pd.read_html(r'https://www.macrotrends.net/stocks/charts/TSLA/tesla/revenue')
revenue_df = revenue_df[1]
# dropna returns a new DataFrame instead of modifying the caller, so the
# result has to be assigned back for the incomplete rows to actually go away.
revenue_df = revenue_df.dropna(how="any")
revenue_df.head(5)

Unnamed: 0,Tesla Quarterly Revenue(Millions of US $),Tesla Quarterly Revenue(Millions of US $).1
0,2020-09-30,"$8,771"
1,2020-06-30,"$6,036"
2,2020-03-31,"$5,985"
3,2019-12-31,"$7,384"
4,2019-09-30,"$6,303"


# Data Cleaning

Currently, the data used includes:

1. Tesla Stock `tsla_df`
2. Dow Jones Index Average `djia_df`
3. Federal Funds Rate `dff_df`
4. Panasonic Corp. Stock `pcrfy_df`
5. BMW Stock `bwm_df`
6. Daimler AG Stock `dai_df`
7. XPeng Inc. Stock `xpev_df`
8. BYD Company Limited Stock `byddf_df`
9. Brent Crude Oil Last Day Financ `oil_df`
10. Number of Tesla vehicles delivered worldwide `deliver_df`
11. Tesla Revenue `revenue_df`

In the previous section we already did a bit of data cleaning, such as:

1. drop nan/missing values 
2. reset the index of dataframe to include `date` as one column in the dataframe

However, we still need to do more:

1. Renaming columns to a more recognizable set of labels 
2. Dropping unnecessary columns in a DataFrame
3. Make some further modifications to `deliver_df` and `revenue_df`
4. Combine all of these DataFrames into one complete DataFrame

## Clean `revenue_df`

In [47]:
# rename the dataframe columns
revenue_df.columns = ["Date", "Tesla Quarterly Revenue(Millions of US $)"]

# drop rows with missing values, reporting every column in which any were found
for col in revenue_df.columns:
    missing = revenue_df[col].isna()
    if missing.any():
        print("WARNING: for revenue_df, column: " + col + " appears missing values")
        revenue_df = revenue_df[~missing]

# Detach from whatever frame the rows were filtered out of, so the column
# assignment below writes to an independent DataFrame instead of a filtered
# slice (which raises SettingWithCopyWarning on classic pandas).
revenue_df = revenue_df.copy()

# transfer data type of "Date" from string to TimeStamp
revenue_df["Date"] = revenue_df["Date"].map(pd.Timestamp)

# transfer data type of "Tesla Quarterly Revenue(Millions of US $)" from string to decimal
def doller2decimal(target):
    """Convert a dollar amount such as ``"$8,771"`` to a float.

    Args:
        target: Either a string that may contain ``$`` signs and ``,``
            thousands separators, or a value that is already numeric.

    Returns:
        The amount as a ``float``.

    Raises:
        ValueError: If the text left after removing ``$`` and ``,`` is not a
            valid number.
    """
    # Values that were already parsed as numbers have no .replace(); convert
    # them directly instead of failing with AttributeError.
    if not isinstance(target, str):
        return float(target)
    return float(target.replace(",", "").replace("$", ""))
# Turn every revenue string into a float via doller2decimal.
revenue_col = "Tesla Quarterly Revenue(Millions of US $)"
revenue_as_float = revenue_df[revenue_col].apply(doller2decimal)
revenue_df[revenue_col] = revenue_as_float

# Order the rows from the oldest date to the newest.
revenue_df = revenue_df.sort_values(by="Date")

# Show the five most recent rows.
revenue_df.tail(5)




Unnamed: 0,Date,Tesla Quarterly Revenue(Millions of US $)
4,2019-09-30,6303.0
3,2019-12-31,7384.0
2,2020-03-31,5985.0
1,2020-06-30,6036.0
0,2020-09-30,8771.0


## Clean `deliver_df`

In [48]:
# add "Date" to the dataframe deliver_df
# Each row of deliver_df was built from one (quarter, year) entry of `quaters`,
# in order, so label it with the last calendar day of that quarter. Deriving
# the date from `quaters` itself keeps rows and dates aligned even when the
# scraped revenue table covers a different set of quarters, and a length
# mismatch raises instead of silently shifting the dates.
deliver_df["Date"] = [
    pd.Period(year + q.upper(), freq="Q-DEC").end_time.normalize()
    for q, year in quaters
]
# Move "Date" to the front. Selecting it by name (rather than rotating the
# last column forward) keeps the column order stable if this cell is re-run.
deliver_df = deliver_df[["Date"] + [c for c in deliver_df.columns if c != "Date"]]

deliver_df

Unnamed: 0,Date,Production Model S/X,Deliveries Model S/X,Production Model 3,Deliveries Model 3,Production Total,Deliveries Total
0,2019-06-30,14517,17650,72531,77550,87048,95200
1,2019-09-30,16318,17400,79837,79600,96155,97000
2,2019-12-31,17933,19450,86958,92550,104891,112000
3,2020-03-31,15390,12200,87282,76200,102672,88400
4,2020-06-30,6326,10600,75946,80050,82272,90650
5,2020-09-30,16992,15200,128044,124100,145036,139300


## Clean & Combine Stocks DFs

In [33]:
# construct a list including all the dataframe for stock data, federal funds rate and Dow Jones index

for df in stock_dfs:
    new_col_names = list()
    for col in df.columns.to_list():

        

High
Low
Open
Close
Volume
Adj Close
DJIA
DFF
High
Low
Open
Close
Volume
Adj Close
High
Low
Open
Close
Volume
Adj Close
High
Low
Open
Close
Volume
Adj Close
High
Low
Open
Close
Volume
Adj Close
High
Low
Open
Close
Volume
Adj Close
