# Data Assignment 1 - `t54zheng` (20939203)

## Task 2 - Basic Statistics

In [5]:
# imports

import pandas as pd
import scipy.stats as stats
from math import sqrt
import warnings
import matplotlib.pyplot as plt
from datetime import timedelta
import numpy as np
from math import sqrt
from scipy.optimize import minimize

warnings.filterwarnings('ignore')

In [6]:
# Import raw data
data_file = "djreturns.xlsx"

dj27 = pd.read_excel(data_file, sheet_name="dj27")
individual_dj27_returns = pd.read_excel(data_file, sheet_name="returns")
sp500_returns = pd.read_excel(data_file, sheet_name="sp500")

***

## Task 2 - *Basic Statistics*

For each of the 27 stocks in `dj27`, and the market return `sp500_returns`, we want to find these statistics on their returns:
- The arithmetic mean
- standard deviation
- skewness
- kurtosis

In [7]:
# We need to group the data in individual_dj27_returns based on stock.
# Note that using PERMNO is a better idea since COMNAM can change.

# Let's show that our data actually has this issue:
duplicate_comnam_df = individual_dj27_returns[["PERMNO", "COMNAM"]].drop_duplicates().groupby("PERMNO").agg({'COMNAM': lambda x: list(x)})
duplicate_comnam_df

Unnamed: 0_level_0,COMNAM
PERMNO,Unnamed: 1_level_1
10107,[MICROSOFT CORP]
10145,[HONEYWELL INTERNATIONAL INC]
11308,[COCA COLA CO]
12490,[INTERNATIONAL BUSINESS MACHS COR]
14008,[AMGEN INC]
14541,"[CHEVRON CORP, CHEVRONTEXACO CORP, CHEVRON COR..."
14593,"[APPLE COMPUTER INC, APPLE INC]"
18163,[PROCTER & GAMBLE CO]
18542,[CATERPILLAR INC]
19502,"[WALGREEN CO, WALGREENS BOOTS ALLIANCE INC]"


We see many securities have multiple comnames as they have changed their company name over the period of the data, but PERMNO remains the same.

In [8]:
# So let's make a new dataframe for each PERMNO we have in dj27, and store them in a dict by PERMNO.
returns_dict = {} # permno -> dataframe(permno_returns)
permnos = dj27["PERMNO"]
for permno in permnos:
    returns_df = individual_dj27_returns[individual_dj27_returns["PERMNO"] == permno]
    returns_dict[permno] = returns_df

In [9]:
# Now that we have our data nicely organized, let's make a new dataframe to present our statistics
# We'll have every row describes the statistics for each return

stats_df = pd.DataFrame(columns=["permno", "Common Name(s)", "Mean (%)", "Standard Deviation (%)", "Skewness", "Kurtosis"])

# add using .loc[-1]
# First add the stats for the market portfolio
market_stats = {
    "permno": "market",
    "Common Name(s)": ["Market"], 
    "Mean (%)": sp500_returns["SPRTRN"].mean(),
    "Standard Deviation (%)": sp500_returns["SPRTRN"].std(),
    "Skewness": stats.skew(sp500_returns["SPRTRN"]),
    "Kurtosis": stats.kurtosis(sp500_returns["SPRTRN"])
    }

# stats_df = stats_df.append(market_stats, ignore_index=True)
stats_df.loc[0] = [v for v in market_stats.values()]

# Now we'll add the rest of the securities from dj27
duplicate_comnam_dict = duplicate_comnam_df.to_dict()['COMNAM']

i = 1
for permno, df in returns_dict.items():
    permno_stats = {
        "permno": permno,
        "Common Name(s)": duplicate_comnam_dict[permno], 
        "Mean (%)": df["RET"].mean(),
        "Standard Deviation (%)": df["RET"].std(),
        "Skewness": stats.skew(df["RET"]),
        "Kurtosis": stats.kurtosis(df["RET"])
    }
    # stats_df = stats_df.append(permno_stats, ignore_index=True)
    stats_df.loc[i] = [v for v in permno_stats.values()]
    i += 1

# Now we need to annualize the mean and standard deviation of the returns (currently monthly)
stats_df["Mean (%)"] = stats_df["Mean (%)"] * 12 # Annualize by multiplying by 12 (no compounding)
stats_df["Standard Deviation (%)"] = stats_df["Mean (%)"] * sqrt(12) # Annualizing stdev

# Format the results
stats_df["Mean (%)"] *= 100
stats_df["Standard Deviation (%)"] *= 100

# Round to 4 decimal places
stats_df = stats_df.round(4)
stats_df

Unnamed: 0,permno,Common Name(s),Mean (%),Standard Deviation (%),Skewness,Kurtosis
0,market,[Market],6.5021,22.5238,-0.5312,1.0848
1,10107,[MICROSOFT CORP],14.1157,48.8983,0.216,3.3232
2,10145,[HONEYWELL INTERNATIONAL INC],12.5522,43.4822,-0.1321,7.7921
3,11308,[COCA COLA CO],7.5283,26.0787,-0.5014,1.1777
4,12490,[INTERNATIONAL BUSINESS MACHS COR],6.5405,22.657,0.4121,3.5562
5,14008,[AMGEN INC],10.327,35.7739,0.5216,1.8024
6,14541,"[CHEVRON CORP, CHEVRONTEXACO CORP, CHEVRON COR...",10.7444,37.2197,0.4071,2.27
7,14593,"[APPLE COMPUTER INC, APPLE INC]",33.2421,115.1539,-0.6328,3.4377
8,18163,[PROCTER & GAMBLE CO],9.3172,32.2759,-1.5059,9.1691
9,18542,[CATERPILLAR INC],17.3336,60.0455,-0.0672,1.9528


In [10]:
# The market portfolio
stats_df[stats_df["permno"] == "market"]

Unnamed: 0,permno,Common Name(s),Mean (%),Standard Deviation (%),Skewness,Kurtosis
0,market,[Market],6.5021,22.5238,-0.5312,1.0848


### Statistics of the market portfolio

- TODO: Comment on skewness and kurtosis of the market return vs a normal dist