# 

# Exploratory Data Analysis

## Common Library Imports

In [1]:
import pandas as pd
import numpy as np

## Load Carbon Emissions Dataset

In [2]:
""" Read in the CSV """
df = pd.read_csv("carbon_emissions.csv")

df

  df = pd.read_csv("carbon_emissions.csv")


Unnamed: 0,periodid,institutionid,reportedcurrencyisocode,tcprimarysectorid,fiscalyear,periodenddate,di_319380,di_319381,di_319382,di_319383,...,streetaddress3,streetaddress4,zipcode,yearfounded,monthfounded,dayfounded,officephonevalue,otherphonevalue,officefaxvalue,webpage
0,24CE2138-A28F-4545-8583-A372D989D03E,11485,USD,524100,2019,2019-12-31,0.000009,0.000,0.001115,0.048,...,,,14727-9200,1887.0,,,585 968 1000,,,www.alleganygroup.com
1,DA940B4C-BF2F-47C8-887C-5EC89E97DD72,11489,USD,524100,2023,2023-12-31,0.036167,0.000,4.013338,0.045,...,,,02919-4949,1835.0,,,401-275-3000,,401-275-3029,www.fmglobal.com
2,3C074168-9E8F-4C98-91C7-31AF6EE82731,11489,USD,524100,2021,2021-12-31,0.030702,0.000,4.206117,0.052,...,,,02919-4949,1835.0,,,401-275-3000,,401-275-3029,www.fmglobal.com
3,37FC949D-8B66-4075-898B-F3C779BDB8BE,11489,USD,524100,2022,2022-12-31,0.016191,0.000,1.796607,0.045,...,,,02919-4949,1835.0,,,401-275-3000,,401-275-3029,www.fmglobal.com
4,F1DC79E1-7706-4FAA-A79F-53E205FD7FA4,11489,USD,524100,2020,2020-12-31,0.025843,0.000,3.182442,0.049,...,,,02919-4949,1835.0,,,401-275-3000,,401-275-3029,www.fmglobal.com
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5988994,B5A463B3-38BF-428F-8D20-CB19A9E5B14A,119043496,USD,550000,2021,2021-12-31,0.000326,0.005,0.007645,0.109,...,,,,,,,,,,www.gecspa.it
5988995,6B0F9762-B31F-4162-8BD4-D8ED5C41906E,119043496,USD,550000,2020,2020-12-31,0.000202,0.005,0.004472,0.101,...,,,,,,,,,,www.gecspa.it
5988996,83E5B6F0-ACCC-4E11-8847-A4B994F3E367,119063782,USD,453000,2021,2021-12-31,0.000199,0.009,0.004160,0.196,...,,,08002,1979.0,,,34 933 17 49 66,,,raima.cat
5988997,5713CA92-E354-441A-A81C-4C84E2772D02,119063782,USD,453000,2020,2020-12-31,0.000144,0.009,0.002808,0.182,...,,,08002,1979.0,,,34 933 17 49 66,,,raima.cat


## Missing data checks to prepare for imputation

In [None]:
""" Missing Data Checks """

In [4]:
""" Number of unique companies across all the years """
np.unique(df["gvkey"]).size # roughly 30k

31511

In [10]:
""" Now determine the number of unique companies in the 
different years
"""
print(f"All years: {np.unique(df['fiscalyear'])}")

for year in np.unique(df['fiscalyear']):
    print(f"{year} => {np.unique(df[df['fiscalyear'] == year]['gvkey']).size}")

All years: [2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015
 2016 2017 2018 2019 2020 2021 2022 2023]
2002 => 1763
2003 => 2000
2004 => 2885
2005 => 3880
2006 => 4170
2007 => 4307
2008 => 4269
2009 => 4563
2010 => 4723
2011 => 4833
2012 => 4868
2013 => 5757
2014 => 6154
2015 => 6235
2016 => 13882
2017 => 14786
2018 => 16979
2019 => 17378
2020 => 23353
2021 => 23386
2022 => 24053
2023 => 9725


In [None]:
# print(df[])

## Write out all the unique gvkeys

In [15]:
with open("wrds_environment_gvkeys.txt", "w") as fh:
    for gvkey in np.unique(df["gvkey"]):
        if not np.isnan(gvkey):
            fh.write(f"{int(gvkey)}\n")

## Read in the daily security prices csv

In [16]:
returns_df = pd.read_csv("daily.csv")

  returns_df = pd.read_csv("daily.csv")


In [17]:
returns_df

Unnamed: 0,gvkey,iid,datadate,conm,trfd,isin,county,sic
0,1166,01W,2002-01-02,ASM INTERNATIONAL NV,1.000000,NL0000334118,,3559.0
1,1166,01W,2002-01-03,ASM INTERNATIONAL NV,1.000000,NL0000334118,,3559.0
2,1166,01W,2002-01-04,ASM INTERNATIONAL NV,1.000000,NL0000334118,,3559.0
3,1166,01W,2002-01-07,ASM INTERNATIONAL NV,1.000000,NL0000334118,,3559.0
4,1166,01W,2002-01-08,ASM INTERNATIONAL NV,1.000000,NL0000334118,,3559.0
...,...,...,...,...,...,...,...,...
113958618,362779,01W,2024-08-06,NOVAMARINE SPA,1.145429,IT0005605701,,
113958619,362779,01W,2024-08-07,NOVAMARINE SPA,1.145429,IT0005605701,,
113958620,362779,01W,2024-08-08,NOVAMARINE SPA,1.145429,IT0005605701,,
113958621,362779,01W,2024-08-09,NOVAMARINE SPA,1.145429,IT0005605701,,


## Check the intersect between the gvkeys of the carbon emissions dataset and the daily prices dataset

In [31]:
# unique non-nan in environment
env_gvkeys = np.unique(df[~np.isnan(df["gvkey"])]["gvkey"])

print(env_gvkeys.size)

31510


In [32]:
# unique non-nan in returns
returns_gvkeys = np.unique(returns_df[~np.isnan(returns_df["gvkey"])]["gvkey"])

print(returns_gvkeys.size)

24110


In [33]:
env_ret_common_gvkeys = np.intersect1d(
    env_gvkeys, 
    returns_gvkeys, 
    assume_unique=True
)

print(env_ret_common_gvkeys.size)

24110


In [34]:
""" Check the missing gvkeys to see what country they are from """
missing_gvkeys = np.setdiff1d(env_gvkeys, env_ret_common_gvkeys, assume_unique=True)

print(missing_gvkeys.size)

7400


## Checks on Duplicates

In [15]:
company_dups = df[df.duplicated('gvkey', keep=False) == True].sort_values(by="gvkey")

In [26]:
last_iid = None
last_fyears = set()
for i, row in company_dups.iterrows():
    current_iid = row["institutionid"]
    current_fyear = row["fiscalyear"]
    if last_iid is not None and current_iid == last_iid:
        if current_fyear in last_fyears:
            raise Exception(f"Fyear clash!, iid: {current_iid}, prev_fyears: {last_fyears}, clash: {current_fyear}")
        else:
            last_fyears.add(current_fyear)
    else:
        last_iid = current_iid
        last_fyears.clear()
        last_fyears.add(current_fyear)

print("No issues...")

Exception: Fyear clash!, iid: 4415462, prev_fyears: {2022}, clash: 2022

In [31]:
company_dups.head(20)

Unnamed: 0,periodid,institutionid,reportedcurrencyisocode,tcprimarysectorid,fiscalyear,periodenddate,di_319380,di_319381,di_319382,di_319383,...,streetaddress3,streetaddress4,zipcode,yearfounded,monthfounded,dayfounded,officephonevalue,otherphonevalue,officefaxvalue,webpage
5,30D218CF-2E2A-46B4-AF72-CADF593290E8,4074603,USD,713A00,2022,01/01/2023,0.285,0.021,2.079,0.153,...,,,76011,1961.0,,,972 595 5000,,,www.sixflags.com
2503,1989BCA5-218F-4857-A3CE-1F2114D67F8E,4074603,USD,713A00,2023,31/12/2023,0.302,0.021,2.098,0.147,...,,,76011,1961.0,,,972 595 5000,,,www.sixflags.com
4988,5B249039-D1E2-43D2-825B-DDD233FE2FE0,4996548,USD,561300,2023,31/12/2023,0.015,0.005,0.125,0.04,...,,,75024,2007.0,,,972 692 2400,,,bgsf.com
40,5F205052-5493-41C8-9C8C-1E0FE1FC5DC0,4996548,USD,561300,2022,01/01/2023,0.014,0.005,0.122,0.041,...,,,75024,2007.0,,,972 692 2400,,,bgsf.com
1,01C53196-7DD9-42C7-9D3B-F152BFB3A364,4054841,USD,445000A,2022,01/01/2023,3.061,0.003,186.279,0.203,...,,,1506 MA,1867.0,,,31 88 659 9111,,,www.aholddelhaize.com
2462,793C84DE-3E9D-4CE0-8768-647BA85F9272,4054841,USD,445000A,2023,31/12/2023,3.183,0.003,181.084,0.189,...,,,1506 MA,1867.0,,,31 88 659 9111,,,www.aholddelhaize.com
110,A706AF20-7252-4C95-AA58-2481363942C5,10175068,USD,722000,2022,02/01/2023,0.095,0.053,0.696,0.389,...,,,33309,2011.0,,,954-618-2000,,,www.burgerfi.com
6071,96731D0E-B576-41E4-8386-DAD27811DF60,10175068,USD,722000,2023,01/01/2024,0.092,0.054,0.625,0.368,...,,,33309,2011.0,,,954-618-2000,,,www.burgerfi.com
99,0446986D-7007-4B72-AA5B-49AA5ADB4CF2,28295169,USD,541512,2022,01/01/2023,0.042,0.02,0.362,0.17,...,,,55425,2016.0,,,952 851 5200,,,www.skywatertechnology.com
6011,9AABC54A-2E82-4C6A-99E2-489EF08AE342,28295169,USD,541512,2023,31/12/2023,0.051,0.018,0.42,0.147,...,,,55425,2016.0,,,952 851 5200,,,www.skywatertechnology.com


In [32]:
company_dups[company_dups["institutionid"] == 4415462]

Unnamed: 0,periodid,institutionid,reportedcurrencyisocode,tcprimarysectorid,fiscalyear,periodenddate,di_319380,di_319381,di_319382,di_319383,...,streetaddress3,streetaddress4,zipcode,yearfounded,monthfounded,dayfounded,officephonevalue,otherphonevalue,officefaxvalue,webpage
23,5282F3CE-9CFA-4830-9116-3B36EFDE5089,4415462,USD,52A000,2022,01/01/2023,0.001,0.001,0.054,0.037,...,,,50059.0,1993.0,,,7 727 244 5484,,7 727 244 5480,www.homecredit.kz
24,5282F3CE-9CFA-4830-9116-3B36EFDE5089,4415462,USD,52A000,2022,01/01/2023,0.001,0.001,0.054,0.037,...,,,,,,,,,,


array(['United States', 'Netherlands', 'United Kingdom', 'Canada',
       'Kazakhstan', 'Belarus', 'Australia', 'Belgium', 'Austria',
       'Finland', 'Ireland', nan, 'Singapore', 'France', 'Denmark',
       'Japan', 'Israel', 'Italy', 'South Africa', 'Thailand', 'Germany',
       'China', 'Hong Kong', 'Luxembourg', 'India', 'Switzerland',
       'Malaysia', 'South Korea', 'Kenya', 'New Zealand', 'Spain',
       'Pakistan', 'Saudi Arabia', 'Sweden', 'British Virgin Islands',
       'Kuwait', 'Turkey', 'Philippines', 'Mauritius', 'Bangladesh',
       'Cayman Islands', 'Botswana', 'Egypt', 'Malta', 'Malawi',
       'Jamaica', 'Bermuda', 'Colombia', 'Mexico', 'Norway', 'Brazil',
       'Bahrain', 'Morocco', 'Indonesia', 'Romania', 'Russia',
       'Ivory Coast', 'Tunisia', 'Greece', 'Vietnam', 'Taiwan', 'Nigeria',
       'Oman', 'Qatar', 'Portugal', 'United Arab Emirates', 'Jersey',
       'Poland', 'Bulgaria', 'Chile', 'Reunion', 'Ghana', 'Monaco',
       'Bahamas', 'Guernsey'], dtype=o