# Data Assignment 1 - `t54zheng` (20939203)



In [167]:
import pandas as pd
import scipy.stats as stats
from math import sqrt
import warnings
warnings.filterwarnings('ignore')

In [137]:
# Import raw data
data_file = "djreturns.xlsx"

dj27 = pd.read_excel(data_file, sheet_name="dj27")
individual_dj27_returns = pd.read_excel(data_file, sheet_name="returns")
sp500_returns = pd.read_excel(data_file, sheet_name="sp500")

## Task 2 - *Basic Statistics*

For each of the 27 stocks in `dj27`, and the market return `sp500_returns`, we want to find these statistics on their returns:
- The arithmetic mean
- standard deviation
- skewness
- kurtosis

In [138]:
# Let's see what our data looks like
dj27.head()

Unnamed: 0,PERMNO,COMNAM,TICKER
0,10107,MICROSOFT CORP,MSFT
1,10145,HONEYWELL INTERNATIONAL INC,HON
2,11308,COCA COLA CO,KO
3,12490,INTERNATIONAL BUSINESS MACHS COR,IBM
4,14008,AMGEN INC,AMGN


In [139]:
individual_dj27_returns.head()

Unnamed: 0,PERMNO,DATE,COMNAM,TICKER,PRC,RET,SHROUT
0,10107,2000-01-31,MICROSOFT CORP,MSFT,97.875,-0.16167,5160025
1,10107,2000-02-29,MICROSOFT CORP,MSFT,89.375,-0.086845,5160025
2,10107,2000-03-31,MICROSOFT CORP,MSFT,106.25,0.188811,5242000
3,10107,2000-04-28,MICROSOFT CORP,MSFT,69.75,-0.343529,5262405
4,10107,2000-05-31,MICROSOFT CORP,MSFT,62.5625,-0.103047,5262405


In [140]:
# We need to group the data in individual_dj27_returns based on stock.
# Note that using PERMNO is a better idea since COMNAM can change.

# Let's show that our data actually has this issue:
duplicate_comnam_df = individual_dj27_returns[["PERMNO", "COMNAM"]].drop_duplicates().groupby("PERMNO").agg({'COMNAM': lambda x: list(x)})
duplicate_comnam_df

Unnamed: 0_level_0,COMNAM
PERMNO,Unnamed: 1_level_1
10107,[MICROSOFT CORP]
10145,[HONEYWELL INTERNATIONAL INC]
11308,[COCA COLA CO]
12490,[INTERNATIONAL BUSINESS MACHS COR]
14008,[AMGEN INC]
14541,"[CHEVRON CORP, CHEVRONTEXACO CORP, CHEVRON COR..."
14593,"[APPLE COMPUTER INC, APPLE INC]"
18163,[PROCTER & GAMBLE CO]
18542,[CATERPILLAR INC]
19502,"[WALGREEN CO, WALGREENS BOOTS ALLIANCE INC]"


We see many securities have multiple comnames as they have changed their company name over the period of the data, but PERMNO remains the same.

In [141]:
# So let's make a new dataframe for each PERMNO we have in dj27, and store them in a dict by PERMNO.
returns_dict = {} # permno -> dataframe(permno_returns)
permnos = dj27["PERMNO"]
for permno in permnos:
    returns_df = individual_dj27_returns[individual_dj27_returns["PERMNO"] == permno]
    returns_dict[permno] = returns_df

In [169]:
# Now that we have our data nicely organized, let's make a new dataframe to present our statistics
# We'll have every row describes the statistics for each return

stats_df = pd.DataFrame(columns=["permno", "Common Name(s)", "Mean (%)", "Standard Deviation (%)", "Skewness", "Kurtosis"])

# First add the stats for the market portfolio
market_stats = {
    "permno": "market",
    "Common Name(s)": ["Market"], 
    "Mean (%)": sp500_returns["SPRTRN"].mean(),
    "Standard Deviation (%)": sp500_returns["SPRTRN"].std(),
    "Skewness": stats.skew(sp500_returns["SPRTRN"]),
    "Kurtosis": stats.kurtosis(sp500_returns["SPRTRN"])
    }

stats_df = stats_df.append(market_stats, ignore_index=True)

# Now we'll add the rest of the securities from dj27
duplicate_comnam_dict = duplicate_comnam_df.to_dict()['COMNAM']

for permno, df in returns_dict.items():
    permno_stats = {
        "permno": permno,
        "Common Name(s)": duplicate_comnam_dict[permno], 
        "Mean (%)": df["RET"].mean(),
        "Standard Deviation (%)": df["RET"].std(),
        "Skewness": stats.skew(df["RET"]),
        "Kurtosis": stats.kurtosis(df["RET"])
    }
    stats_df = stats_df.append(permno_stats, ignore_index=True)

# Now we need to annualize the mean and standard deviation of the returns (currently monthly)
stats_df["Mean (%)"] = (1 + stats_df["Mean (%)"]) ** 12 - 1
stats_df["Standard Deviation (%)"] = stats_df["Mean (%)"] * sqrt(12) # Annualizing stdev

# Format the results
stats_df["Mean (%)"] *= 100
stats_df["Standard Deviation (%)"] *= 100

# Round to 4 decimal places
stats_df = stats_df.round(4)
stats_df

Unnamed: 0,permno,Common Name(s),Mean (%),Standard Deviation (%),Skewness,Kurtosis
0,market,[Market],6.6994,23.2074,-0.5312,1.0848
1,10107,[MICROSOFT CORP],15.0657,52.1892,0.216,3.3232
2,10145,[HONEYWELL INTERNATIONAL INC],13.3002,46.0731,-0.1321,7.7921
3,11308,[COCA COLA CO],7.7935,26.9976,-0.5014,1.1777
4,12490,[INTERNATIONAL BUSINESS MACHS COR],6.7402,23.3487,0.4121,3.5562
5,14008,[AMGEN INC],10.8301,37.5167,0.5216,1.8024
6,14541,"[CHEVRON CORP, CHEVRONTEXACO CORP, CHEVRON COR...",11.2896,39.1084,0.4071,2.27
7,14593,"[APPLE COMPUTER INC, APPLE INC]",38.805,134.4243,-0.6328,3.4377
8,18163,[PROCTER & GAMBLE CO],9.7256,33.6905,-1.5059,9.1691
9,18542,[CATERPILLAR INC],18.7792,65.0532,-0.0672,1.9528


In [145]:
stats.kurtosis(sp500_returns["SPRTRN"])

1.084770190991489

In [146]:
duplicate_comnam_df.to_dict()['COMNAM']

{10107: ['MICROSOFT CORP'],
 10145: ['HONEYWELL INTERNATIONAL INC'],
 11308: ['COCA COLA CO'],
 12490: ['INTERNATIONAL BUSINESS MACHS COR'],
 14008: ['AMGEN INC'],
 14541: ['CHEVRON CORP', 'CHEVRONTEXACO CORP', 'CHEVRON CORP NEW'],
 14593: ['APPLE COMPUTER INC', 'APPLE INC'],
 18163: ['PROCTER & GAMBLE CO'],
 18542: ['CATERPILLAR INC'],
 19502: ['WALGREEN CO', 'WALGREENS BOOTS ALLIANCE INC'],
 19561: ['BOEING CO'],
 22111: ['JOHNSON & JOHNSON'],
 22592: ['MINNESOTA MINING & MFG CO', '3M CO'],
 22752: ['MERCK & CO INC', 'MERCK & CO INC NEW'],
 26403: ['DISNEY WALT CO'],
 43449: ['MCDONALDS CORP'],
 47896: ['CHASE MANHATTAN CORP NEW',
  'J P MORGAN CHASE & CO',
  'JPMORGAN CHASE & CO'],
 55976: ['WAL MART STORES INC', 'WALMART INC'],
 57665: ['NIKE INC'],
 59176: ['AMERICAN EXPRESS CO'],
 59328: ['INTEL CORP'],
 59459: ['ST PAUL COS INC',
  'ST PAUL TRAVELERS COS INC',
  'TRAVELERS COMPANIES INC'],
 65875: ['BELL ATLANTIC CORP', 'VERIZON COMMUNICATIONS INC'],
 66181: ['HOME DEPOT INC'],
