# Imports

In [1]:
import pandas as pd
import numpy as np

In [2]:
import statsmodels.formula.api as smf

# Data Sample
> 2014-2024; 

> NA & Global Firms;

> Interested in returns from 2016-01 onwards

## Emissions Data

### Load

In [3]:
""" Read in the CSV """
raw_emissions_df = pd.read_csv(
    "emissions_2014to2024.csv", 
    dtype={"companyid": "str", "gvkey": "str"}, #identifiers
    parse_dates=["periodenddate"], 
)

raw_emissions_df

Unnamed: 0,institutionid,fiscalyear,periodenddate,di_319413,di_319414,di_319415,gvkey,companyname,country
0,11485,2019,2019-12-31,1.171531,4.551169,56.416747,,Allegany Co-op Insurance Company,United States
1,11489,2023,2023-12-31,4154.828402,12742.671094,176205.682880,,Factory Mutual Insurance Company,United States
2,11489,2021,2021-12-31,3809.540653,16339.416941,203773.712750,,Factory Mutual Insurance Company,United States
3,11489,2022,2022-12-31,1859.947024,5704.373536,78880.089343,,Factory Mutual Insurance Company,United States
4,11489,2020,2020-12-31,3356.875454,12529.659338,159075.046820,,Factory Mutual Insurance Company,United States
...,...,...,...,...,...,...,...,...,...
5929322,119043496,2021,2021-12-31,47.475921,146.879648,280.011919,,Geo Engineering Consulting,Italy
5929323,119043496,2020,2020-12-31,30.736150,87.774611,170.360967,,Geo Engineering Consulting,Italy
5929324,119063782,2021,2021-12-31,25.679523,90.062691,149.618703,,"LA CARPETA I EL PAPER, SA",Spain
5929325,119063782,2020,2020-12-31,19.478006,61.830186,105.704910,,"LA CARPETA I EL PAPER, SA",Spain


### EDA (Can be skipped)

**Inspect the loaded datatypes**

In [1514]:
print(raw_emissions_df["gvkey"].isnull().sum())
print((raw_emissions_df["gvkey"] == "").sum())

1769291
0


In [1511]:
raw_emissions_df.dtypes

institutionid             int64
fiscalyear                int64
periodenddate    datetime64[ns]
di_319413               float64
di_319414               float64
di_319415               float64
companyid                object
gvkey                    object
companyname              object
country                  object
dtype: object

**The company ids are extracted and written out to a file**

In [680]:
""" Utility functions regarding validity, uniqueness, and writing out 
unique and valid ids. """

def keep_valid(data, colname=None):
    """ Only keeps the rows/items from the given dataframe/array-like which 
    have valid values - for the specified column in the case of a dataframe.

    Args:
        data: Dataframe or array-like.
        colname: Name of the dataframe column whose valid values we are using to 
            filter.

    Returns:
        The dataframe/array-like with rows/items that have invalid values, w.r.t 
        the specified column if applicable, filtered out.
    """

    # Array whose values we are interested in
    col = data[colname] if colname is not None else data
    # Values are considered valid as long as they are not NaN
    return data[pd.notnull(col)]


def extract_unique(df, colname):
    """ Extract the unique and non-NaN values from a dataframe column.
    
    Args:
        df: Dataframe.
        colname: Column name w.r.t the dataframe.

    Returns:
        Unique and non-NaN column values.
    """

    return keep_valid(pd.unique(df[colname]))


def write_ids(df, idname, filename):
    """ Writes the unique, non-NaN instances of the indicated identifier, within the 
    indicated dataframe, on separate lines of a new file, whose filename should 
    be specified.

    Args:
        df: Dataframe.
        idname: Column name of the identifier with respect to the dataframe.
        filename: The name to use for the newly created file.
    """
    with open(filename, "w") as fh:
        for idval in extract_unique(df, idname):
            fh.write(f"{idval}\n")

In [4]:
""" Export CIQ company ids """
write_ids(raw_emissions_df, "companyid", "companyids.txt")

**The number of unique company ids and gvkeys are reported for this raw emissions data**

In [681]:
""" Utility function """
def report_unique(df, colname):
    """ Reports unique, non-NaN values of a dataframe column
    
    Args:
        df: Dataframe.
        colname: Column name.
    """

    print(f"Number of unique, non-null values of \'{colname}\': {len(extract_unique(df, colname))}")

In [682]:
print("-- Raw Emissions Data")
report_unique(raw_emissions_df, "institutionid")
report_unique(raw_emissions_df, "companyid")
report_unique(raw_emissions_df, "gvkey")

-- Raw Emissions Data
Number of unique, non-null values of 'institutionid': 1720932
Number of unique, non-null values of 'companyid': 1725359
Number of unique, non-null values of 'gvkey': 24304


**Check whether every gvkey entry has a corresponding company id (non-null -> non-null)**

In [684]:
def is_backed(df, src_colname, ref_colname):
    """ Check if one column is backed by another in a dataframe. This means that 
    when the first column is non-null, the second column cannot be null as 
    otherwise it would be backing the first column.

    Args:
        df: Dataframe.
        src_colname: Name of column that should be backed.
        ref_colname: Name of backing column.
    
    Returns:
        True/False depending on whether the first column is backed by the second
    """

    # check for 0 invalid cases for valid backing
    return (df[src_colname].notnull() & df[ref_colname].isnull()).sum() == 0

def report_backed(df, src_colname, ref_colname):
    """ Reports whether every non-null value of src is backed by a non-null 
    value of ref.
    
    Args:
        df: Dataframe.
        src_colname: Name of column that should be backed.
        ref_colname: Name of backing column.
    """
    print(f"Is \'{src_colname}\' backed by \'{ref_colname}\': {'YES' if is_backed(df, src_colname, ref_colname) else 'NO'}")

In [685]:
print("-- Raw Emissions Data")
report_backed(
    raw_emissions_df, "gvkey", "companyid"
)
report_backed(
    raw_emissions_df, "companyid", "institutionid"
)

-- Raw Emissions Data
Is 'gvkey' backed by 'companyid': YES
Is 'companyid' backed by 'institutionid': YES


**Check for duplicates and null values in the 3 identifiers - companyid, gvkey and institutionid**

In [686]:
""" Utilty functions """
def report_null(df, colname):
    """ Indicate the number of nulls in the specified column in the 
    given dataframe.
    
    Args:
        df: Dataframe.
        src_colname: Name of column that we are interested in.
    """
    print(f"Number of nulls in \'{colname}\': {df[colname].isnull().sum()}")

def report_dup(df, colname):
    """ Indicate the presence of duplicates in the specified column in the 
    given dataframe.
    
    Args:
        df: Dataframe.
        src_colname: Name of column that we are interested in.
    """

    print(f"Are there duplicates in \'{colname}\': {'YES' if df[colname].duplicated().any() else 'NO'}")

In [687]:
print("-- Raw Emissions Data")
print("> Null checks")
report_null(raw_emissions_df, "companyid")
report_null(raw_emissions_df, "institutionid")
report_null(raw_emissions_df, "gvkey")
print("> Dup checks")
report_dup(raw_emissions_df, "companyid")
report_dup(raw_emissions_df, "institutionid")
report_dup(raw_emissions_df, "gvkey")

-- Raw Emissions Data
> Null checks
Number of nulls in 'companyid': 747
Number of nulls in 'institutionid': 0
Number of nulls in 'gvkey': 1769291
> Dup checks
Are there duplicates in 'companyid': YES
Are there duplicates in 'institutionid': YES
Are there duplicates in 'gvkey': YES


**Utilities for numerical value conflicts on grouping by a key. Check for one to one mapping conflicts between identifiers.**

In [688]:
""" Utility functions """

# aggregates series values, NaN if all equal, else string representation with space delimiter
# TODO: assumes no NaN in vals and also the existence of at least 1 element due 
# to the retrieval at index 0.
numer_conflict_aggr = lambda vals: np.nan if (vals.iloc[0] == vals).all() else " ".join(vals.astype(str))

def detect_numerical_conflicts(df, key, numer_cols):
    """ Detects numerical conflicts when grouping on a dataframe by the given 
    keys - filling in the multiple conflict values for inspection.
    
    Args:
        df: Dataframe.
        key: Key or keys that are used for grouping.
        numer_cols: Name of numerical columns which are checked for conflicts
    
    Returns:
        Dataframe with values as NaN if no conflicts, else the conflict values 
        delimited by spaces are filled in.
    """

    return df.groupby(key).agg(
        {
            numer_col : numer_conflict_aggr for numer_col in numer_cols
        }
    )

def one_to_one_conflict_aggr(vals):
    """ Aggregator for the 1t1 conflicts, NaN indicates valid as 1 mapped, else 
    the conflicting values of the second identifier are returned 
    (nothing or multiple).
    
    Args:
        vals: Series that is aggregated.
    Returns:
        NaN if valid, else the conflict values joined together in a string, 
        separated by spaces.
    """
    # simple short-circuit on apparent validity
    if vals.count() == 1:
        return np.nan
    # checking if there is conflict by there being no mapping
    if vals.count() == 0:
        return ""

    # otherwise, we have multiple entries (>1)
    vals = vals.dropna() # remove null bloat to leave just the entries

    if (vals.iloc[0] == vals).all(): # reduces to 1 actual mapping, hence valid
        return np.nan
    else: # otherwise, multiple actual mappings, which creates conflicts
        return " ".join(vals.astype(str))

def detect_1t1_conflicts(df, key, oth_id):
    """ Detects one to one mapping conflicts when grouping on a dataframe by the 
    first identifier and seeing how many of the second identifier this maps to.
    
    Args:
        df: Dataframe.
        key: Key for the first identifier.
        oth_id: Name of the column for the second identifier
    
    Returns:
        Dataframe with values as NaN if no conflicts, else the conflict values, 
        for the second identifier, delimited by spaces are filled in.
    """

    return df.groupby(key).agg(
        {
            oth_id : one_to_one_conflict_aggr, 
        }
    )

**Numerical value conflicts in environmental metrics when grouping by gvkey and 
fiscal year.**

In [689]:
raw_emissions_gvkey_numer_conflicts = detect_numerical_conflicts(
    raw_emissions_df, 
    ["gvkey", "fiscalyear", "periodenddate"], 
    ["di_319413", "di_319414", "di_319415"]
)

raw_emissions_gvkey_numer_conflicts

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,di_319413,di_319414,di_319415
gvkey,fiscalyear,periodenddate,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
001004,2022,2022-05-31,,,
001045,2022,2022-12-31,,,
001045,2023,2023-12-31,,,
001050,2022,2022-12-31,,,
001050,2023,2023-12-31,,,
...,...,...,...,...,...
362683,2022,2022-03-31,,,
362705,2022,2022-12-31,,,
362758,2022,2022-03-31,,,
362761,2022,2022-03-31,,,


In [690]:
raw_emissions_gvkey_numer_conflicts[
    pd.notnull(raw_emissions_gvkey_numer_conflicts["di_319413"])
    | pd.notnull(raw_emissions_gvkey_numer_conflicts["di_319414"])
    | pd.notnull(raw_emissions_gvkey_numer_conflicts["di_319415"])
]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,di_319413,di_319414,di_319415
gvkey,fiscalyear,periodenddate,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
003413,2022,2022-12-31,2867127.0 51062759.241,1018275.0 3933.0872579,1650402.398 2024652.4121
003897,2022,2022-12-31,27114550.031 25358937.262,197000.0 1953.2613298,4025432.601 1005488.8192
004094,2022,2022-12-31,810.06 1410.524 7995.579,7696.0 2956.39 9723.273,197193.363 84877.747 39817.706
005180,2022,2022-12-31,636360.11 53101.342601,251038.0 32063.589498,2436669.603 254265.62927
005600,2022,2022-12-31,1080082.1763 4073314.057,61712.663909 73116.0,850262.94718 353150.506
...,...,...,...,...,...
275535,2022,2022-03-31,734.626 112.41037212,4498.412 111.2256901,8751.123 2720.5732053
275535,2023,2023-03-31,611.138 80.405872624,3819.836 79.558482923,6836.914 1945.9953604
289724,2022,2022-12-31,101.653 0.0053786782,1642.732 0.0869208615,7071.309 0.3741598245
295786,2022,2022-12-31,134627.057 0.9207597207,129177.195 3.5451698548,5009667.962 64.639735408


It is evident from the above that the gvkeys are mapping to multiple company ids in 
the same period and this is leading to multiple instances of the environment metrics

**Gvkey <-> Companyid Mapping**

- *gvkey -> company id*

In [691]:
raw_emissions_gvkey_companyid_1t1_conflicts = detect_1t1_conflicts(
    raw_emissions_df, 
    "gvkey", "companyid"
)

raw_emissions_gvkey_companyid_1t1_conflicts

Unnamed: 0_level_0,companyid
gvkey,Unnamed: 1_level_1
001004,
001045,
001050,
001075,
001076,
...,...
362620,
362683,
362705,
362758,


In [692]:
raw_emissions_gvkey_companyid_1t1_conflicts[
    pd.notnull(raw_emissions_gvkey_companyid_1t1_conflicts["companyid"])
]

Unnamed: 0_level_0,companyid
gvkey,Unnamed: 1_level_1
002856,259777 259777 3103613 3103613
003413,263295 6020983 6020983
003897,266598 1067744 1067744
004094,528325 27755 386000 386000
005180,275442 275442 5546439
...,...
275535,109366051 109366051 881331 881331
275839,53336883 5482350
289724,215005509 215005509 30941174
295786,102326862 215788931


Notice that gvkey can map to multiple companyids, but remember that it 
maps to at least 1 company id because it backed by companyid.

- *company id -> gvkey*

In [693]:
raw_emissions_companyid_gvkey_1t1_conflicts = detect_1t1_conflicts(
    raw_emissions_df, 
    "companyid", "gvkey"
)

raw_emissions_companyid_gvkey_1t1_conflicts

Unnamed: 0_level_0,gvkey
companyid,Unnamed: 1_level_1
100000307,
100013,
1000212,
1000277,
10004497,
...,...
99996472,
99996476,
99996998,
99997106,


In [694]:
raw_emissions_companyid_gvkey_1t1_conflicts[
    pd.notnull(raw_emissions_companyid_gvkey_1t1_conflicts["gvkey"]) 
    & (raw_emissions_companyid_gvkey_1t1_conflicts["gvkey"] == "")
]

Unnamed: 0_level_0,gvkey
companyid,Unnamed: 1_level_1
100000307,
1000212,
1000277,
10004521,
10005029,
...,...
99996472,
99996476,
99996998,
99997106,


In [695]:
raw_emissions_companyid_gvkey_1t1_conflicts[
    pd.notnull(raw_emissions_companyid_gvkey_1t1_conflicts["gvkey"]) 
    & (raw_emissions_companyid_gvkey_1t1_conflicts["gvkey"] != "")
]

Unnamed: 0_level_0,gvkey
companyid,Unnamed: 1_level_1
104422749,037090 184982
106683684,285220 316585
1067744,003897 065089 003897 065089
1073371,100557 220942 100557 220942
112732,024616 145471 024616 145471
...,...
9683016,220688 247655
983017,007824 145270 007824 145270
9833116,242985 321595
99505,015863 214881 015863 214881


Notice that companyid can both map to no or multiple gvkeys

**Institutionid <-> Companyid Mapping**

- *companyid -> institutionid*

In [696]:
raw_emissions_companyid_institutionid_1t1_conflicts = detect_1t1_conflicts(
    raw_emissions_df, 
    "companyid", "institutionid"
)

raw_emissions_companyid_institutionid_1t1_conflicts

Unnamed: 0_level_0,institutionid
companyid,Unnamed: 1_level_1
100000307,
100013,
1000212,
1000277,
10004497,
...,...
99996472,
99996476,
99996998,
99997106,


In [697]:
raw_emissions_companyid_institutionid_1t1_conflicts[
    pd.notnull(raw_emissions_companyid_institutionid_1t1_conflicts["institutionid"])
]

Unnamed: 0_level_0,institutionid
companyid,Unnamed: 1_level_1


Hence, we can see companyid maps to exactly one institutionid.

- *institutionid -> companyid*

In [698]:
raw_emissions_institutionid_companyid_1t1_conflicts = detect_1t1_conflicts(
    raw_emissions_df, 
    "institutionid", "companyid"
)

raw_emissions_institutionid_companyid_1t1_conflicts

Unnamed: 0_level_0,companyid
institutionid,Unnamed: 1_level_1
11489,6520204 24951392 43964734 6520204 24951392 439...
11654,
11679,8333444 24586834 8333444 24586834
11894,7925667 34873695 7925667 34873695
12062,
...,...
118526590,
118706040,
118918440,
118991426,


In [699]:
raw_emissions_institutionid_companyid_1t1_conflicts[
    pd.notnull(raw_emissions_institutionid_companyid_1t1_conflicts["companyid"])
]

Unnamed: 0_level_0,companyid
institutionid,Unnamed: 1_level_1
11489,6520204 24951392 43964734 6520204 24951392 439...
11679,8333444 24586834 8333444 24586834
11894,7925667 34873695 7925667 34873695
13644,26465936 60478967 26465936 60478967
13959,246652 6167099 242354247 246652 6167099 242354247
...,...
109987465,
110365423,
111445983,
111768063,


Here, we can see that while a company belongs to exactly one institution, an 
institution maps to many companys (none, one or potentially many).

**Gvkey <-> Institutionid Mapping**

- *gvkey -> institutionid*

In [700]:
raw_emissions_gvkey_institutionid_1t1_conflicts = detect_1t1_conflicts(
    raw_emissions_df, 
    "gvkey", "institutionid"
)

raw_emissions_gvkey_institutionid_1t1_conflicts

Unnamed: 0_level_0,institutionid
gvkey,Unnamed: 1_level_1
001004,
001045,
001050,
001075,
001076,
...,...
362620,
362683,
362705,
362758,


In [701]:
raw_emissions_gvkey_institutionid_1t1_conflicts[
    pd.notnull(raw_emissions_gvkey_institutionid_1t1_conflicts["institutionid"])
]

Unnamed: 0_level_0,institutionid
gvkey,Unnamed: 1_level_1
002856,4057039 4057039 4057076 4057076
003413,4057041 4057080 4057080
003897,4057044 4057083 4057083
004094,108462 4021861 4388004 4388004
005180,4806213 4806213 5053995
...,...
275535,4265945 4265945 4326804 4326804
275839,4295672 6343205
289724,4772912 4772912 6393031
295786,4996309 6523164


Noting that there are no null values in institutionid, this means that gvkey 
corresponds to at least one, and potentially many, institutions.

- *institutionid -> gvkey*

In [702]:
raw_emissions_institutionid_gvkey_1t1_conflicts = detect_1t1_conflicts(
    raw_emissions_df, 
    "institutionid", "gvkey"
)

raw_emissions_institutionid_gvkey_1t1_conflicts

Unnamed: 0_level_0,gvkey
institutionid,Unnamed: 1_level_1
11489,
11654,
11679,
11894,
12062,
...,...
118526590,
118706040,
118918440,
118991426,


In [703]:
raw_emissions_institutionid_gvkey_1t1_conflicts[
    pd.notnull(raw_emissions_institutionid_gvkey_1t1_conflicts["gvkey"]) 
    & (raw_emissions_institutionid_gvkey_1t1_conflicts["gvkey"] == "")
]

Unnamed: 0_level_0,gvkey
institutionid,Unnamed: 1_level_1
11489,
11654,
11679,
11894,
12062,
...,...
118526590,
118706040,
118918440,
118991426,


In [704]:
raw_emissions_institutionid_gvkey_1t1_conflicts[
    pd.notnull(raw_emissions_institutionid_gvkey_1t1_conflicts["gvkey"]) 
    & (raw_emissions_institutionid_gvkey_1t1_conflicts["gvkey"] != "")
]

Unnamed: 0_level_0,gvkey
institutionid,Unnamed: 1_level_1
100165,002001 002002
100259,004708 027665 004708 027665
100391,008119 017095 008119 017095
101674,005849 039571
103042,012124 027867
...,...
6618361,204440 247501
6626040,278266 340246
6932029,100787 326859
9159619,221031 326164


We can see that institutionids map to many gvkeys (can be zero, one or multiple)

**Determine the number of gvkey identifiers that exist across all fiscal years and also the number that exist in each fiscal year**

First check whether `fiscalyear` is ever null

In [705]:
print("Is fiscalyear ever null:")
print('YES' if raw_emissions_df["fiscalyear"].isnull().any() else 'NO')

Is fiscalyear ever null:
NO


Check if there are any duplicate fiscal year entries for gvkeys

In [706]:
raw_emissions_gvkey_fiscalyear_dup = raw_emissions_df.groupby("gvkey").agg(
    {
        "fiscalyear": lambda vals: vals.duplicated().any(),
    }
)

raw_emissions_gvkey_fiscalyear_dup[
    raw_emissions_gvkey_fiscalyear_dup["fiscalyear"]
]

Unnamed: 0_level_0,fiscalyear
gvkey,Unnamed: 1_level_1
002856,True
003413,True
003897,True
004094,True
005180,True
...,...
271134,True
275535,True
289724,True
295786,True


We can see there are, and this is due to the duplication from the varying 
`companyid`'s and `institutionid`'s

### Data Preparation
> Mainly involves filtering

In [4]:
# cid_gvkey_df = pd.read_csv("cid_gvkey_map.csv")
cid_gvkey_df = pd.concat(
    [pd.read_csv(f"p{i}_gvkey_cids.csv") for i in range(7)],
    ignore_index=True
)

cid_gvkey_df

Unnamed: 0,companyid,gvkey,startdate,enddate,companyname
0,18511,210835,B,E,3i Group plc
1,18527,210418,B,E,ABB Ltd
2,18671,29751,B,E,Albemarle Corporation
3,18711,28349,B,E,The Allstate Corporation
4,18749,64768,B,E,"Amazon.com, Inc."
...,...,...,...,...,...
32326,1856268950,50370,B,E,Sucro Limited
32327,1859487646,359029,B,E,KET Inc.
32328,1863445043,362169,B,E,Chaosua Foods Industry Public Company Limited
32329,1866194559,42972,2024-01-01,E,"Eco Bright Future, Inc."


Inspect the loaded datatypes

In [5]:
cid_gvkey_df.dtypes

companyid       int64
gvkey           int64
startdate      object
enddate        object
companyname    object
dtype: object

In [6]:
print("-- Null Values:")
cid_gvkey_df.isnull().sum()

-- Null Values:


companyid      0
gvkey          0
startdate      0
enddate        0
companyname    0
dtype: int64

Filter the mappings to just those that are one-to-one, and keep note of these.

In [7]:
cid_gvkey_1t1_df = cid_gvkey_df[(cid_gvkey_df["startdate"] == "B") & (cid_gvkey_df["enddate"] == "E")]

cid_gvkey_1t1_df

Unnamed: 0,companyid,gvkey,startdate,enddate,companyname
0,18511,210835,B,E,3i Group plc
1,18527,210418,B,E,ABB Ltd
2,18671,29751,B,E,Albemarle Corporation
3,18711,28349,B,E,The Allstate Corporation
4,18749,64768,B,E,"Amazon.com, Inc."
...,...,...,...,...,...
32325,1855399529,358653,B,E,"SEIYU KOGYO Co.,Ltd."
32326,1856268950,50370,B,E,Sucro Limited
32327,1859487646,359029,B,E,KET Inc.
32328,1863445043,362169,B,E,Chaosua Foods Industry Public Company Limited


*Another approach for filtering mappings is considering where gvkey is not 
duplicated, as opposed to mappings that exist for all of time. This alternative 
approach may however lead to cases where the same company exists under 
different gvkeys (same companyid corresponds to multiple gvkeys). This was not 
chosen in our case.*

Verify that these mappings are indeed one-to-one.

In [8]:
print(f"Gvkey duplicates: {cid_gvkey_1t1_df['gvkey'].duplicated().any()}")
print(f"Companyid duplicates: {cid_gvkey_1t1_df['companyid'].duplicated().any()}")

Gvkey duplicates: False
Companyid duplicates: False


The lack of duplicates on both sides indicates that the mappings are indeed one to one.

>Notice filtering to one-to-one mappings reduces the number of unique gvkeys from ~24k to ~23k in the join table.

**Filter the main environment table using this join table, remembering the following facts:**
The raw main environment table has:
- ~1.8 million rows
- ~1.7 million unique companyid/institutionid values
- ~24k unique gvkey values

From the main environment table, filter entries to just those with gvkeys.

In [9]:
emissions_df = raw_emissions_df[pd.notnull(raw_emissions_df["gvkey"])]
emissions_df = emissions_df.copy()

emissions_df

Unnamed: 0,institutionid,fiscalyear,periodenddate,di_319413,di_319414,di_319415,gvkey,companyname,country
15,11555,2021,2021-12-31,6896.779446,29580.824880,3.689112e+05,122954,"American Family Mutual Insurance Company, S.I.",United States
16,11555,2020,2020-12-31,6711.648272,25051.470514,3.180505e+05,122954,"American Family Mutual Insurance Company, S.I.",United States
161,13959,2022,2022-12-31,18853.165084,72589.700681,1.323541e+06,122594,State Farm Mutual Automobile Insurance Company,United States
162,13959,2023,2023-12-31,23250.617734,89521.063147,1.632254e+06,122594,State Farm Mutual Automobile Insurance Company,United States
163,13959,2021,2021-12-31,18241.016543,94615.486662,1.580437e+06,122594,State Farm Mutual Automobile Insurance Company,United States
...,...,...,...,...,...,...,...,...,...
5929180,113580486,2019,2019-12-31,17592.373778,1748.093285,5.985221e+03,050370,Sucro Limited,United States
5929209,114230580,2022,2022-03-31,2.811856,0.177205,1.435370e+00,359029,KET Inc.,Japan
5929246,114764179,2022,2022-12-31,1641.238617,1117.543920,4.313828e+04,362169,Chaosua Foods Industry Public Company Limited,Thailand
5929316,118989678,2021,2021-12-31,162.790078,201.931404,5.448875e+03,362779,Novamarine S.p.A.,Italy


In [10]:
emissions_df.dtypes

institutionid             int64
fiscalyear                int64
periodenddate    datetime64[ns]
di_319413               float64
di_319414               float64
di_319415               float64
gvkey                    object
companyname              object
country                  object
dtype: object

In [11]:
emissions_df["gvkey"] = emissions_df["gvkey"].astype(int)
emissions_df.dtypes

institutionid             int64
fiscalyear                int64
periodenddate    datetime64[ns]
di_319413               float64
di_319414               float64
di_319415               float64
gvkey                     int64
companyname              object
country                  object
dtype: object

**Further filter to just the gvkeys that 1 to 1 map with companyid.** 
This should leave one entry per fiscal year for each gvkey. 

*Note some gvkeys may not have an entry for certain fiscal years because that data is simply missing.*

In [12]:
emissions_df = emissions_df[
    np.isin(emissions_df["gvkey"], cid_gvkey_1t1_df["gvkey"])
]

emissions_df

Unnamed: 0,institutionid,fiscalyear,periodenddate,di_319413,di_319414,di_319415,gvkey,companyname,country
15,11555,2021,2021-12-31,6896.779446,29580.824880,3.689112e+05,122954,"American Family Mutual Insurance Company, S.I.",United States
16,11555,2020,2020-12-31,6711.648272,25051.470514,3.180505e+05,122954,"American Family Mutual Insurance Company, S.I.",United States
161,13959,2022,2022-12-31,18853.165084,72589.700681,1.323541e+06,122594,State Farm Mutual Automobile Insurance Company,United States
162,13959,2023,2023-12-31,23250.617734,89521.063147,1.632254e+06,122594,State Farm Mutual Automobile Insurance Company,United States
163,13959,2021,2021-12-31,18241.016543,94615.486662,1.580437e+06,122594,State Farm Mutual Automobile Insurance Company,United States
...,...,...,...,...,...,...,...,...,...
5929180,113580486,2019,2019-12-31,17592.373778,1748.093285,5.985221e+03,50370,Sucro Limited,United States
5929209,114230580,2022,2022-03-31,2.811856,0.177205,1.435370e+00,359029,KET Inc.,Japan
5929246,114764179,2022,2022-12-31,1641.238617,1117.543920,4.313828e+04,362169,Chaosua Foods Industry Public Company Limited,Thailand
5929316,118989678,2021,2021-12-31,162.790078,201.931404,5.448875e+03,362779,Novamarine S.p.A.,Italy


Here, we check whether the above statement of exactly one entry per gvkey, 
fiscal year is true (barring the exception of missing data).

In [13]:
emissions_gvkey_fiscalyear_entries = emissions_df.groupby(["gvkey", "fiscalyear"]).size()

emissions_gvkey_fiscalyear_entries

gvkey   fiscalyear
1004    2016          1
        2017          1
        2018          1
        2019          1
        2020          1
                     ..
362761  2021          1
        2022          1
        2023          1
362779  2020          1
        2021          1
Length: 147049, dtype: int64

In [14]:
emissions_gvkey_fiscalyear_entries[
    emissions_gvkey_fiscalyear_entries != 1
]

Series([], dtype: int64)

The empty return for a number of entries different than 1 shows that 
we have indeed made gvkey, fiscal year unique.


In [15]:
emissions_df.nunique()["gvkey"]

29403

We can see that this process has reduced the number of unique `gvkey`'s to 
~19k now.

In [16]:
emissions_df

Unnamed: 0,institutionid,fiscalyear,periodenddate,di_319413,di_319414,di_319415,gvkey,companyname,country
15,11555,2021,2021-12-31,6896.779446,29580.824880,3.689112e+05,122954,"American Family Mutual Insurance Company, S.I.",United States
16,11555,2020,2020-12-31,6711.648272,25051.470514,3.180505e+05,122954,"American Family Mutual Insurance Company, S.I.",United States
161,13959,2022,2022-12-31,18853.165084,72589.700681,1.323541e+06,122594,State Farm Mutual Automobile Insurance Company,United States
162,13959,2023,2023-12-31,23250.617734,89521.063147,1.632254e+06,122594,State Farm Mutual Automobile Insurance Company,United States
163,13959,2021,2021-12-31,18241.016543,94615.486662,1.580437e+06,122594,State Farm Mutual Automobile Insurance Company,United States
...,...,...,...,...,...,...,...,...,...
5929180,113580486,2019,2019-12-31,17592.373778,1748.093285,5.985221e+03,50370,Sucro Limited,United States
5929209,114230580,2022,2022-03-31,2.811856,0.177205,1.435370e+00,359029,KET Inc.,Japan
5929246,114764179,2022,2022-12-31,1641.238617,1117.543920,4.313828e+04,362169,Chaosua Foods Industry Public Company Limited,Thailand
5929316,118989678,2021,2021-12-31,162.790078,201.931404,5.448875e+03,362779,Novamarine S.p.A.,Italy


**Perform QC with respect to null values**

In [17]:
emissions_df.dropna()

Unnamed: 0,institutionid,fiscalyear,periodenddate,di_319413,di_319414,di_319415,gvkey,companyname,country
15,11555,2021,2021-12-31,6896.779446,29580.824880,3.689112e+05,122954,"American Family Mutual Insurance Company, S.I.",United States
16,11555,2020,2020-12-31,6711.648272,25051.470514,3.180505e+05,122954,"American Family Mutual Insurance Company, S.I.",United States
161,13959,2022,2022-12-31,18853.165084,72589.700681,1.323541e+06,122594,State Farm Mutual Automobile Insurance Company,United States
162,13959,2023,2023-12-31,23250.617734,89521.063147,1.632254e+06,122594,State Farm Mutual Automobile Insurance Company,United States
163,13959,2021,2021-12-31,18241.016543,94615.486662,1.580437e+06,122594,State Farm Mutual Automobile Insurance Company,United States
...,...,...,...,...,...,...,...,...,...
5929180,113580486,2019,2019-12-31,17592.373778,1748.093285,5.985221e+03,50370,Sucro Limited,United States
5929209,114230580,2022,2022-03-31,2.811856,0.177205,1.435370e+00,359029,KET Inc.,Japan
5929246,114764179,2022,2022-12-31,1641.238617,1117.543920,4.313828e+04,362169,Chaosua Foods Industry Public Company Limited,Thailand
5929316,118989678,2021,2021-12-31,162.790078,201.931404,5.448875e+03,362779,Novamarine S.p.A.,Italy


We can see that the number of rows is not reduced, therefore there are no 
null values and the data quality has been verified.

**Write out the gvkeys for linking with fundamentals and returns data**

In [1664]:
write_ids(emissions_df, "gvkey", "gvkeys.txt")

**Inspect the fiscal years and the year-month accounting ends which are associated with them**

Create an extra column to isolate the month and year from the entire `periodenddate`. 
This column will be called `periodend_ym` to denote that it just retains the 
year and month information.
- This new column will be of the `period[M]` type

In [18]:
emissions_df = emissions_df.copy()
emissions_df["periodend_ym"] = emissions_df['periodenddate'].dt.to_period('M')

print(f"New Column Type: {emissions_df['periodend_ym'].dtype}")
emissions_df

New Column Type: period[M]


Unnamed: 0,institutionid,fiscalyear,periodenddate,di_319413,di_319414,di_319415,gvkey,companyname,country,periodend_ym
15,11555,2021,2021-12-31,6896.779446,29580.824880,3.689112e+05,122954,"American Family Mutual Insurance Company, S.I.",United States,2021-12
16,11555,2020,2020-12-31,6711.648272,25051.470514,3.180505e+05,122954,"American Family Mutual Insurance Company, S.I.",United States,2020-12
161,13959,2022,2022-12-31,18853.165084,72589.700681,1.323541e+06,122594,State Farm Mutual Automobile Insurance Company,United States,2022-12
162,13959,2023,2023-12-31,23250.617734,89521.063147,1.632254e+06,122594,State Farm Mutual Automobile Insurance Company,United States,2023-12
163,13959,2021,2021-12-31,18241.016543,94615.486662,1.580437e+06,122594,State Farm Mutual Automobile Insurance Company,United States,2021-12
...,...,...,...,...,...,...,...,...,...,...
5929180,113580486,2019,2019-12-31,17592.373778,1748.093285,5.985221e+03,50370,Sucro Limited,United States,2019-12
5929209,114230580,2022,2022-03-31,2.811856,0.177205,1.435370e+00,359029,KET Inc.,Japan,2022-03
5929246,114764179,2022,2022-12-31,1641.238617,1117.543920,4.313828e+04,362169,Chaosua Foods Industry Public Company Limited,Thailand,2022-12
5929316,118989678,2021,2021-12-31,162.790078,201.931404,5.448875e+03,362779,Novamarine S.p.A.,Italy,2021-12


Group by fiscal year and year-month ends to see which year-month ends are 
contained within each fiscal year.

In [19]:
emissions_df.groupby(["fiscalyear", "periodend_ym"]).size()

fiscalyear  periodend_ym
2013        2014-01            3
2014        2014-01           40
            2014-02           83
            2014-03          829
            2014-04           41
                            ... 
2023        2023-09          265
            2023-10           59
            2023-11           29
            2023-12         5628
            2024-01            5
Length: 131, dtype: int64

In [20]:
emissions_df.groupby("fiscalyear").size()

fiscalyear
2013        3
2014     5743
2015     5813
2016    13043
2017    13907
2018    15944
2019    16368
2020    22111
2021    22165
2022    22799
2023     9153
dtype: int64

**Finalise DataFrame**
- according to joining considerations

In [21]:
emissions_df = emissions_df.reset_index(drop=True)

emissions_df

Unnamed: 0,institutionid,fiscalyear,periodenddate,di_319413,di_319414,di_319415,gvkey,companyname,country,periodend_ym
0,11555,2021,2021-12-31,6896.779446,29580.824880,3.689112e+05,122954,"American Family Mutual Insurance Company, S.I.",United States,2021-12
1,11555,2020,2020-12-31,6711.648272,25051.470514,3.180505e+05,122954,"American Family Mutual Insurance Company, S.I.",United States,2020-12
2,13959,2022,2022-12-31,18853.165084,72589.700681,1.323541e+06,122594,State Farm Mutual Automobile Insurance Company,United States,2022-12
3,13959,2023,2023-12-31,23250.617734,89521.063147,1.632254e+06,122594,State Farm Mutual Automobile Insurance Company,United States,2023-12
4,13959,2021,2021-12-31,18241.016543,94615.486662,1.580437e+06,122594,State Farm Mutual Automobile Insurance Company,United States,2021-12
...,...,...,...,...,...,...,...,...,...,...
147044,113580486,2019,2019-12-31,17592.373778,1748.093285,5.985221e+03,50370,Sucro Limited,United States,2019-12
147045,114230580,2022,2022-03-31,2.811856,0.177205,1.435370e+00,359029,KET Inc.,Japan,2022-03
147046,114764179,2022,2022-12-31,1641.238617,1117.543920,4.313828e+04,362169,Chaosua Foods Industry Public Company Limited,Thailand,2022-12
147047,118989678,2021,2021-12-31,162.790078,201.931404,5.448875e+03,362779,Novamarine S.p.A.,Italy,2021-12


In [22]:
emissions_df = emissions_df.set_index(
    ["gvkey", "fiscalyear"]
)
emissions_df = emissions_df.sort_index()

emissions_df

Unnamed: 0_level_0,Unnamed: 1_level_0,institutionid,periodenddate,di_319413,di_319414,di_319415,companyname,country,periodend_ym
gvkey,fiscalyear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1004,2016,4157610,2016-05-31,56034.630253,28700.181128,263831.791850,AAR Corp.,United States,2016-05
1004,2017,4157610,2017-05-31,59349.114332,33489.111842,311638.624770,AAR Corp.,United States,2017-05
1004,2018,4157610,2018-05-31,54842.261413,30334.505186,211206.593710,AAR Corp.,United States,2018-05
1004,2019,4157610,2019-05-31,62932.147241,32039.420175,217486.646950,AAR Corp.,United States,2019-05
1004,2020,4157610,2020-05-31,62592.179000,32015.494000,192435.070000,AAR Corp.,United States,2020-05
...,...,...,...,...,...,...,...,...,...
362761,2021,6627661,2021-03-31,5542.197413,4629.001659,34334.587431,Akums Drugs and Pharmaceuticals Limited,India,2021-03
362761,2022,6627661,2022-03-31,7202.802447,4349.187602,34489.242775,Akums Drugs and Pharmaceuticals Limited,India,2022-03
362761,2023,6627661,2023-03-31,6651.660029,4016.397443,31850.202650,Akums Drugs and Pharmaceuticals Limited,India,2023-03
362779,2020,118989678,2020-12-31,105.640229,119.005359,3249.154936,Novamarine S.p.A.,Italy,2020-12


In [23]:
emissions = emissions_df.drop(
    columns=[
        "institutionid", "periodenddate", "companyname", "periodend_ym"
    ]
)
# "companyid"

emissions

Unnamed: 0_level_0,Unnamed: 1_level_0,di_319413,di_319414,di_319415,country
gvkey,fiscalyear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1004,2016,56034.630253,28700.181128,263831.791850,United States
1004,2017,59349.114332,33489.111842,311638.624770,United States
1004,2018,54842.261413,30334.505186,211206.593710,United States
1004,2019,62932.147241,32039.420175,217486.646950,United States
1004,2020,62592.179000,32015.494000,192435.070000,United States
...,...,...,...,...,...
362761,2021,5542.197413,4629.001659,34334.587431,India
362761,2022,7202.802447,4349.187602,34489.242775,India
362761,2023,6651.660029,4016.397443,31850.202650,India
362779,2020,105.640229,119.005359,3249.154936,Italy


**Inspect 0 GHG values across scopes 1, 2 and 3**

In [24]:
emissions_df[
    (emissions_df["di_319413"] == 0) 
    | (emissions_df["di_319414"] == 0)
]

Unnamed: 0_level_0,Unnamed: 1_level_0,institutionid,periodenddate,di_319413,di_319414,di_319415,companyname,country,periodend_ym
gvkey,fiscalyear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2578,2022,4071224,2022-12-31,0.000000e+00,1112.00,8.428337e+03,Telos Corporation,United States,2022-12
4093,2014,4121470,2014-12-31,1.260000e+08,0.00,9.218064e+06,Duke Energy Corporation,United States,2014-12
12689,2019,4094395,2019-11-30,0.000000e+00,24025.00,9.032422e+05,KB Home,United States,2019-11
12689,2020,4094395,2020-11-30,0.000000e+00,19744.00,8.371673e+05,KB Home,United States,2020-11
15647,2015,4144815,2015-12-31,0.000000e+00,426.00,1.136394e+05,Storebrand ASA,Norway,2015-12
...,...,...,...,...,...,...,...,...,...
349529,2022,10691607,2022-12-31,0.000000e+00,80.48,1.263894e+03,Brii Biosciences Limited,China,2022-12
351514,2022,27663466,2022-12-31,0.000000e+00,25113.70,8.387886e+04,DR Corporation Limited,China,2022-12
351514,2023,27663466,2023-12-31,0.000000e+00,12902.78,4.470471e+04,DR Corporation Limited,China,2023-12
351587,2022,100596622,2022-12-31,0.000000e+00,311.79,1.197620e+05,"Hangzhou SF Intra-city Industrial Co., Ltd.",China,2022-12


In [25]:
emissions_df[
    (emissions_df["di_319415"] == 0)
]

Unnamed: 0_level_0,Unnamed: 1_level_0,institutionid,periodenddate,di_319413,di_319414,di_319415,companyname,country,periodend_ym
gvkey,fiscalyear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


- todo: extract fundamentals data of the same fiscal years (also see the link between periodenddate and fiscal year)
- todo: market data of one year ahead from the fiscal/fundamentals data
  - or at least one month

**Phase 1 Context Awareness**

Number of US based companies out of the ~19k unique gvkeys

In [26]:
(emissions_df.groupby(level="gvkey").first()["country"] == "United States").sum()

4655

In [47]:
# pd.unique(emissions_df["country"])

## Currency Data
- currency exchange fluctuations, when taking USD as the base currency, cause FX returns that should be accounted for
- market value and other quantities must be converted into a common currency (USD) to facilitate comparisons

Load daily currency exchange rates to USD & GBP. Sort by origin currency and then date.

In [27]:
exrts_df = pd.read_csv(
    "exrts_2014to2024.csv", 
    parse_dates=["datadate"], 
)
exrts_df = exrts_df.sort_values(["curd", "datadate"])

exrts_df

Unnamed: 0,curd,datadate,exratd_toGBP,exratd_toUSD
0,AED,2014-01-01,0.164541,0.272250
174,AED,2014-01-02,0.165645,0.272254
348,AED,2014-01-03,0.165590,0.272264
522,AED,2014-01-04,0.165590,0.272264
692,AED,2014-01-05,0.165590,0.272264
...,...,...,...,...
628762,ZWL,2024-08-17,0.002350,0.003032
628915,ZWL,2024-08-18,0.002350,0.003032
629068,ZWL,2024-08-19,0.002350,0.003051
629221,ZWL,2024-08-20,0.002350,0.003060


In [28]:
exrts_df.dtypes

curd                    object
datadate        datetime64[ns]
exratd_toGBP           float64
exratd_toUSD           float64
dtype: object

Compute monthly currency rates.
- Based on the exchange rates at the end of the months i.e. the last ones.

In [29]:
exrts_df["data_ym"] = exrts_df["datadate"].dt.to_period('M')

exrts_df

Unnamed: 0,curd,datadate,exratd_toGBP,exratd_toUSD,data_ym
0,AED,2014-01-01,0.164541,0.272250,2014-01
174,AED,2014-01-02,0.165645,0.272254,2014-01
348,AED,2014-01-03,0.165590,0.272264,2014-01
522,AED,2014-01-04,0.165590,0.272264,2014-01
692,AED,2014-01-05,0.165590,0.272264,2014-01
...,...,...,...,...,...
628762,ZWL,2024-08-17,0.002350,0.003032,2024-08
628915,ZWL,2024-08-18,0.002350,0.003032,2024-08
629068,ZWL,2024-08-19,0.002350,0.003051,2024-08
629221,ZWL,2024-08-20,0.002350,0.003060,2024-08


In [30]:
m_exrts = exrts_df.groupby(
    ["curd", "data_ym"]
).last(
).drop(
    columns="datadate"
)

m_exrts

Unnamed: 0_level_0,Unnamed: 1_level_0,exratd_toGBP,exratd_toUSD
curd,data_ym,Unnamed: 2_level_1,Unnamed: 3_level_1
AED,2014-01,0.165667,0.272257
AED,2014-02,0.162459,0.272249
AED,2014-03,0.163316,0.272248
AED,2014-04,0.161238,0.272251
AED,2014-05,0.162301,0.272243
...,...,...,...
ZWL,2024-04,0.002350,0.002944
ZWL,2024-05,0.002350,0.002991
ZWL,2024-06,0.002350,0.002970
ZWL,2024-07,0.002350,0.003018


## Market Data
- Global Market Index Data

In [31]:
mkt_df = pd.read_csv(
    "market_prices_2014to2024.csv", 
    parse_dates=["datadate"], 
)
# year-month index
mkt_df["data_ym"] = mkt_df["datadate"].dt.to_period("M")
mkt_df = mkt_df.set_index("data_ym")
# compute monthly returns from prices
mkt_df["m_mktret"] = mkt_df["prccm"].pct_change() * 100
mkt_df = mkt_df.dropna()

mkt_df

Unnamed: 0_level_0,gvkeyx,prccm,datadate,conm,tic,m_mktret
data_ym,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-02,150918,1856.3682,2014-02-28,S&P Global 1200 Index,I6UNK112,4.694558
2014-03,150918,1858.1357,2014-03-31,S&P Global 1200 Index,I6UNK112,0.095213
2014-04,150918,1877.0979,2014-04-30,S&P Global 1200 Index,I6UNK112,1.020496
2014-05,150918,1906.7544,2014-05-31,S&P Global 1200 Index,I6UNK112,1.579912
2014-06,150918,1937.6656,2014-06-30,S&P Global 1200 Index,I6UNK112,1.621142
...,...,...,...,...,...,...
2024-03,150918,3774.8709,2024-03-31,S&P Global 1200 Index,I6UNK112,3.153459
2024-04,150918,3636.9339,2024-04-30,S&P Global 1200 Index,I6UNK112,-3.654085
2024-05,150918,3792.7481,2024-05-31,S&P Global 1200 Index,I6UNK112,4.284219
2024-06,150918,3871.9183,2024-06-30,S&P Global 1200 Index,I6UNK112,2.087410


In [32]:
mkt_df = mkt_df[["m_mktret"]]

mkt_df

Unnamed: 0_level_0,m_mktret
data_ym,Unnamed: 1_level_1
2014-02,4.694558
2014-03,0.095213
2014-04,1.020496
2014-05,1.579912
2014-06,1.621142
...,...
2024-03,3.153459
2024-04,-3.654085
2024-05,4.284219
2024-06,2.087410


## Returns Data

### NA & Global - Loading and Preliminary Processing

**NA**

In [33]:
na_returns_df = pd.read_csv(
    "na_security_returns_2014to2024.csv", 
    dtype={"iid": "str", "tpci": "str"}, 
    parse_dates=["datadate"], 
)
print(na_returns_df.dtypes)
na_returns_df = na_returns_df.sort_values(["gvkey", "iid", "datadate"])

na_returns_df

KeyboardInterrupt: 

In [168]:
na_returns = na_returns_df[
    (na_returns_df["tpci"] == "0")
]
na_returns = na_returns.sort_values(["gvkey", "iid", "datadate"])
na_returns["data_ym"] = na_returns["datadate"].dt.to_period('M')

print(na_returns.isnull().sum())
na_returns

gvkey             0
iid               0
datadate          0
conm              0
curcdd         6091
ajexdi         6091
cshoc         37292
prccd          6245
trfd        4148160
tpci              0
data_ym           0
dtype: int64


Unnamed: 0,gvkey,iid,datadate,conm,curcdd,ajexdi,cshoc,prccd,trfd,tpci,data_ym
0,1004,01,2014-01-02,AAR CORP,USD,1.0,39600000.0,27.360,1.619454,0,2014-01
1,1004,01,2014-01-03,AAR CORP,USD,1.0,39600000.0,26.880,1.619454,0,2014-01
2,1004,01,2014-01-06,AAR CORP,USD,1.0,39600000.0,26.780,1.619454,0,2014-01
3,1004,01,2014-01-07,AAR CORP,USD,1.0,39600000.0,26.510,1.619454,0,2014-01
4,1004,01,2014-01-08,AAR CORP,USD,1.0,39600000.0,26.730,1.619454,0,2014-01
...,...,...,...,...,...,...,...,...,...,...,...
17873581,351590,01,2024-08-19,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,37.374,,0,2024-08
17873582,351590,01,2024-08-20,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,37.270,,0,2024-08
17873583,351590,01,2024-08-21,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,37.270,,0,2024-08
17873584,351590,01,2024-08-22,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,37.540,,0,2024-08


In [193]:
na_returns = na_returns.dropna(subset="prccd")

print(na_returns.isnull().sum())
na_returns

gvkey             0
iid               0
datadate          0
conm              0
curcdd            0
ajexdi            0
cshoc         31201
prccd             0
trfd        4147467
tpci              0
data_ym           0
dtype: int64


Unnamed: 0,gvkey,iid,datadate,conm,curcdd,ajexdi,cshoc,prccd,trfd,tpci,data_ym
0,1004,01,2014-01-02,AAR CORP,USD,1.0,39600000.0,27.360,1.619454,0,2014-01
1,1004,01,2014-01-03,AAR CORP,USD,1.0,39600000.0,26.880,1.619454,0,2014-01
2,1004,01,2014-01-06,AAR CORP,USD,1.0,39600000.0,26.780,1.619454,0,2014-01
3,1004,01,2014-01-07,AAR CORP,USD,1.0,39600000.0,26.510,1.619454,0,2014-01
4,1004,01,2014-01-08,AAR CORP,USD,1.0,39600000.0,26.730,1.619454,0,2014-01
...,...,...,...,...,...,...,...,...,...,...,...
17873581,351590,01,2024-08-19,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,37.374,,0,2024-08
17873582,351590,01,2024-08-20,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,37.270,,0,2024-08
17873583,351590,01,2024-08-21,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,37.270,,0,2024-08
17873584,351590,01,2024-08-22,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,37.540,,0,2024-08


In [194]:
na_m_returns = na_returns.groupby(
    ["gvkey", "iid", "data_ym"]
).last()

na_m_returns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,curcdd,ajexdi,cshoc,prccd,trfd,tpci
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1004,01,2014-01,2014-01-31,AAR CORP,USD,1.0,39600000.0,26.650,1.623944,0
1004,01,2014-02,2014-02-28,AAR CORP,USD,1.0,39600000.0,28.900,1.623944,0
1004,01,2014-03,2014-03-31,AAR CORP,USD,1.0,39569000.0,25.950,1.623944,0
1004,01,2014-04,2014-04-30,AAR CORP,USD,1.0,39569000.0,25.900,1.628586,0
1004,01,2014-05,2014-05-30,AAR CORP,USD,1.0,39569000.0,24.300,1.628586,0
...,...,...,...,...,...,...,...,...,...,...
351590,01,2024-04,2024-04-30,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,46.864,,0
351590,01,2024-05,2024-05-31,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,41.880,,0
351590,01,2024-06,2024-06-28,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,38.622,,0
351590,01,2024-07,2024-07-31,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,37.938,,0


In [195]:
na_m_returns.isnull().sum()

datadate         0
conm             0
curcdd           0
ajexdi           0
cshoc         2507
prccd            0
trfd        213680
tpci             0
dtype: int64

In [196]:
trfd_allna = na_m_returns.isna().groupby(
    level=["gvkey", "iid"]
)["trfd"].all()

trfd_allna_idx = trfd_allna[trfd_allna].index
trfd_allna_idx

MultiIndex([(  1166, '02'),
            (  1712, '01'),
            (  1864, '01'),
            (  1932, '01'),
            (  2176, '01'),
            (  2176, '02'),
            (  2220, '01'),
            (  2250, '01'),
            (  2411, '01'),
            (  2578, '08'),
            ...
            (347007, '01'),
            (347328, '01'),
            (347471, '01'),
            (347708, '01'),
            (348615, '01'),
            (349485, '01'),
            (350366, '01'),
            (350952, '01'),
            (351491, '01'),
            (351590, '01')],
           names=['gvkey', 'iid'], length=2571)

In [197]:
na_m_returns.loc[
    np.isin(
        na_m_returns.index.droplevel("data_ym"), 
        trfd_allna_idx
    ), 
    "trfd"
] = 1

na_m_returns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,curcdd,ajexdi,cshoc,prccd,trfd,tpci
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1004,01,2014-01,2014-01-31,AAR CORP,USD,1.0,39600000.0,26.650,1.623944,0
1004,01,2014-02,2014-02-28,AAR CORP,USD,1.0,39600000.0,28.900,1.623944,0
1004,01,2014-03,2014-03-31,AAR CORP,USD,1.0,39569000.0,25.950,1.623944,0
1004,01,2014-04,2014-04-30,AAR CORP,USD,1.0,39569000.0,25.900,1.628586,0
1004,01,2014-05,2014-05-30,AAR CORP,USD,1.0,39569000.0,24.300,1.628586,0
...,...,...,...,...,...,...,...,...,...,...
351590,01,2024-04,2024-04-30,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,46.864,1.000000,0
351590,01,2024-05,2024-05-31,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,41.880,1.000000,0
351590,01,2024-06,2024-06-28,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,38.622,1.000000,0
351590,01,2024-07,2024-07-31,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,37.938,1.000000,0


In [198]:
na_m_returns.isnull().sum()

datadate       0
conm           0
curcdd         0
ajexdi         0
cshoc       2507
prccd          0
trfd           0
tpci           0
dtype: int64

In [199]:
na_m_returns = na_m_returns.dropna()

na_m_returns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,curcdd,ajexdi,cshoc,prccd,trfd,tpci
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1004,01,2014-01,2014-01-31,AAR CORP,USD,1.0,39600000.0,26.650,1.623944,0
1004,01,2014-02,2014-02-28,AAR CORP,USD,1.0,39600000.0,28.900,1.623944,0
1004,01,2014-03,2014-03-31,AAR CORP,USD,1.0,39569000.0,25.950,1.623944,0
1004,01,2014-04,2014-04-30,AAR CORP,USD,1.0,39569000.0,25.900,1.628586,0
1004,01,2014-05,2014-05-30,AAR CORP,USD,1.0,39569000.0,24.300,1.628586,0
...,...,...,...,...,...,...,...,...,...,...
351590,01,2024-04,2024-04-30,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,46.864,1.000000,0
351590,01,2024-05,2024-05-31,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,41.880,1.000000,0
351590,01,2024-06,2024-06-28,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,38.622,1.000000,0
351590,01,2024-07,2024-07-31,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,37.938,1.000000,0


In [202]:
na_m_returns = na_m_returns.drop(columns="tpci").rename(
    columns={
        "curcdd": "curcdm", 
        "cshoc": "cshom", 
        "prccd": "prccm", 
        "ajexdi": "ajexm", 
        "trfd": "trfm", 
    }
)

na_m_returns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,curcdm,ajexm,cshom,prccm,trfm
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1004,01,2014-01,2014-01-31,AAR CORP,USD,1.0,39600000.0,26.650,1.623944
1004,01,2014-02,2014-02-28,AAR CORP,USD,1.0,39600000.0,28.900,1.623944
1004,01,2014-03,2014-03-31,AAR CORP,USD,1.0,39569000.0,25.950,1.623944
1004,01,2014-04,2014-04-30,AAR CORP,USD,1.0,39569000.0,25.900,1.628586
1004,01,2014-05,2014-05-30,AAR CORP,USD,1.0,39569000.0,24.300,1.628586
...,...,...,...,...,...,...,...,...,...
351590,01,2024-04,2024-04-30,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,46.864,1.000000
351590,01,2024-05,2024-05-31,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,41.880,1.000000
351590,01,2024-06,2024-06-28,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,38.622,1.000000
351590,01,2024-07,2024-07-31,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,37.938,1.000000


In [203]:
na_m_returns = na_m_returns.reset_index()
na_m_returns.to_csv(
    "na_security_m_returns_2014to2024.csv", 
    index=False, header=True, 
)

na_m_returns

Unnamed: 0,gvkey,iid,data_ym,datadate,conm,curcdm,ajexm,cshom,prccm,trfm
0,1004,01,2014-01,2014-01-31,AAR CORP,USD,1.0,39600000.0,26.650,1.623944
1,1004,01,2014-02,2014-02-28,AAR CORP,USD,1.0,39600000.0,28.900,1.623944
2,1004,01,2014-03,2014-03-31,AAR CORP,USD,1.0,39569000.0,25.950,1.623944
3,1004,01,2014-04,2014-04-30,AAR CORP,USD,1.0,39569000.0,25.900,1.628586
4,1004,01,2014-05,2014-05-30,AAR CORP,USD,1.0,39569000.0,24.300,1.628586
...,...,...,...,...,...,...,...,...,...,...
617266,351590,01,2024-04,2024-04-30,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,46.864,1.000000
617267,351590,01,2024-05,2024-05-31,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,41.880,1.000000
617268,351590,01,2024-06,2024-06-28,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,38.622,1.000000
617269,351590,01,2024-07,2024-07-31,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,37.938,1.000000


In [34]:
na_m_returns = pd.read_csv(
    "na_security_m_returns_2014to2024.csv", 
    parse_dates=["datadate", "data_ym"], 
)
na_m_returns["data_ym"] = na_m_returns["data_ym"].dt.to_period('M')
na_m_returns = na_m_returns.set_index(["gvkey", "iid", "data_ym"])

na_m_returns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,curcdm,ajexm,cshom,prccm,trfm
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1004,01,2014-01,2014-01-31,AAR CORP,USD,1.0,39600000.0,26.650,1.623944
1004,01,2014-02,2014-02-28,AAR CORP,USD,1.0,39600000.0,28.900,1.623944
1004,01,2014-03,2014-03-31,AAR CORP,USD,1.0,39569000.0,25.950,1.623944
1004,01,2014-04,2014-04-30,AAR CORP,USD,1.0,39569000.0,25.900,1.628586
1004,01,2014-05,2014-05-30,AAR CORP,USD,1.0,39569000.0,24.300,1.628586
...,...,...,...,...,...,...,...,...,...
351590,01,2024-04,2024-04-30,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,46.864,1.000000
351590,01,2024-05,2024-05-31,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,41.880,1.000000
351590,01,2024-06,2024-06-28,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,38.622,1.000000
351590,01,2024-07,2024-07-31,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,37.938,1.000000


**Global**

In [48]:
global_returns_df = pd.read_csv(
    "global_security_returns_2014to2024.csv", 
    dtype={
        "tpci": "str"
    }, 
    parse_dates=["datadate"], 
)

global_returns_df

Unnamed: 0,gvkey,iid,datadate,conm,curcdd,ajexdi,cshoc,prccd,trfd,tpci
0,1166,01W,2014-01-01,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,23.950,1.242834,0
1,1166,01W,2014-01-02,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,23.855,1.242834,0
2,1166,01W,2014-01-03,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,23.880,1.242834,0
3,1166,01W,2014-01-06,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,23.575,1.242834,0
4,1166,01W,2014-01-07,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,23.535,1.242834,0
...,...,...,...,...,...,...,...,...,...,...
66927973,362779,01W,2024-08-15,NOVAMARINE SPA,EUR,1.0,12388500.0,3.580,1.145429,0
66927974,362779,01W,2024-08-16,NOVAMARINE SPA,EUR,1.0,12388500.0,3.580,1.145429,0
66927975,362779,01W,2024-08-19,NOVAMARINE SPA,EUR,1.0,12388500.0,3.580,1.145429,0
66927976,362779,01W,2024-08-20,NOVAMARINE SPA,EUR,1.0,12388500.0,3.580,1.145429,0


In [1763]:
global_returns_df.dtypes

gvkey                int64
iid                 object
datadate    datetime64[ns]
conm                object
curcdd              object
ajexdi             float64
cshoc              float64
prccd              float64
trfd               float64
tpci                object
dtype: object

In [56]:
global_returns_df.isnull().sum()

gvkey            0
iid              0
datadate         0
conm             0
curcdd        9037
ajexdi        9037
cshoc       765959
prccd         9037
trfd          2634
tpci             0
dtype: int64

In [192]:
global_returns_df[
    global_returns_df["prccd"].isna()
].isna().sum()

gvkey          0
iid            0
datadate       0
conm           0
curcdd      9037
ajexdi      9037
cshoc       9037
prccd       9037
trfd        2634
tpci           0
dtype: int64

We are only interested in the returns and market cap from ordinary stock.

In [87]:
global_returns = global_returns_df[
    (global_returns_df["tpci"] == "0")
]

global_returns

Unnamed: 0,gvkey,iid,datadate,conm,curcdd,ajexdi,cshoc,prccd,trfd,tpci
0,1166,01W,2014-01-01,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,23.950,1.242834,0
1,1166,01W,2014-01-02,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,23.855,1.242834,0
2,1166,01W,2014-01-03,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,23.880,1.242834,0
3,1166,01W,2014-01-06,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,23.575,1.242834,0
4,1166,01W,2014-01-07,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,23.535,1.242834,0
...,...,...,...,...,...,...,...,...,...,...
66927973,362779,01W,2024-08-15,NOVAMARINE SPA,EUR,1.0,12388500.0,3.580,1.145429,0
66927974,362779,01W,2024-08-16,NOVAMARINE SPA,EUR,1.0,12388500.0,3.580,1.145429,0
66927975,362779,01W,2024-08-19,NOVAMARINE SPA,EUR,1.0,12388500.0,3.580,1.145429,0
66927976,362779,01W,2024-08-20,NOVAMARINE SPA,EUR,1.0,12388500.0,3.580,1.145429,0


Sort by company, issue and then finally date.

In [88]:
global_returns = global_returns.sort_values(["gvkey", "iid", "datadate"])

global_returns

Unnamed: 0,gvkey,iid,datadate,conm,curcdd,ajexdi,cshoc,prccd,trfd,tpci
0,1166,01W,2014-01-01,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,23.950,1.242834,0
1,1166,01W,2014-01-02,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,23.855,1.242834,0
2,1166,01W,2014-01-03,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,23.880,1.242834,0
3,1166,01W,2014-01-06,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,23.575,1.242834,0
4,1166,01W,2014-01-07,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,23.535,1.242834,0
...,...,...,...,...,...,...,...,...,...,...
66927973,362779,01W,2024-08-15,NOVAMARINE SPA,EUR,1.0,12388500.0,3.580,1.145429,0
66927974,362779,01W,2024-08-16,NOVAMARINE SPA,EUR,1.0,12388500.0,3.580,1.145429,0
66927975,362779,01W,2024-08-19,NOVAMARINE SPA,EUR,1.0,12388500.0,3.580,1.145429,0
66927976,362779,01W,2024-08-20,NOVAMARINE SPA,EUR,1.0,12388500.0,3.580,1.145429,0


Add the year-month, which will be aggregated on shortly.

In [89]:
global_returns["data_ym"] = global_returns["datadate"].dt.to_period('M')

global_returns

Unnamed: 0,gvkey,iid,datadate,conm,curcdd,ajexdi,cshoc,prccd,trfd,tpci,data_ym
0,1166,01W,2014-01-01,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,23.950,1.242834,0,2014-01
1,1166,01W,2014-01-02,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,23.855,1.242834,0,2014-01
2,1166,01W,2014-01-03,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,23.880,1.242834,0,2014-01
3,1166,01W,2014-01-06,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,23.575,1.242834,0,2014-01
4,1166,01W,2014-01-07,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,23.535,1.242834,0,2014-01
...,...,...,...,...,...,...,...,...,...,...,...
66927973,362779,01W,2024-08-15,NOVAMARINE SPA,EUR,1.0,12388500.0,3.580,1.145429,0,2024-08
66927974,362779,01W,2024-08-16,NOVAMARINE SPA,EUR,1.0,12388500.0,3.580,1.145429,0,2024-08
66927975,362779,01W,2024-08-19,NOVAMARINE SPA,EUR,1.0,12388500.0,3.580,1.145429,0,2024-08
66927976,362779,01W,2024-08-20,NOVAMARINE SPA,EUR,1.0,12388500.0,3.580,1.145429,0,2024-08


In [96]:
global_returns = global_returns.groupby(
    ["gvkey", "iid", "data_ym"]
).last()

global_returns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,curcdd,ajexdi,cshoc,prccd,trfd,tpci
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1166,01W,2014-01,2014-01-31,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,24.850,1.242834,0
1166,01W,2014-02,2014-02-28,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,28.015,1.242834,0
1166,01W,2014-03,2014-03-31,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,29.135,1.242834,0
1166,01W,2014-04,2014-04-30,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,31.495,1.242834,0
1166,01W,2014-05,2014-05-30,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,30.545,1.264115,0
...,...,...,...,...,...,...,...,...,...,...
362705,01W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.820,1.147992,0
362705,02W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.630,1.147992,0
362758,01W,2024-08,2024-08-21,NUREN GROUP LIMITED,AUD,1.0,154500000.0,0.220,1.147992,0
362761,01W,2024-08,2024-08-21,AKUMS DRUGS AND PHARMA,INR,1.0,157393988.0,1010.750,1.147992,0


Drop null values and finalise to the monthly format, assuming ordinary share 
focus.

In [107]:
global_returns = global_returns.dropna()
global_returns = global_returns.drop(columns="tpci").rename(
    columns={
        "curcdd": "curcdm", 
        "cshoc": "cshom", 
        "prccd": "prccm", 
        "ajexdi": "ajexm", 
        "trfd": "trfm", 
    }
)

global_returns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,curcdm,ajexm,cshom,prccm,trfm
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1166,01W,2014-01,2014-01-31,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,24.850,1.242834
1166,01W,2014-02,2014-02-28,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,28.015,1.242834
1166,01W,2014-03,2014-03-31,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,29.135,1.242834
1166,01W,2014-04,2014-04-30,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,31.495,1.242834
1166,01W,2014-05,2014-05-30,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,30.545,1.264115
...,...,...,...,...,...,...,...,...,...
362705,01W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.820,1.147992
362705,02W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.630,1.147992
362758,01W,2024-08,2024-08-21,NUREN GROUP LIMITED,AUD,1.0,154500000.0,0.220,1.147992
362761,01W,2024-08,2024-08-21,AKUMS DRUGS AND PHARMA,INR,1.0,157393988.0,1010.750,1.147992


Reset the index so that it is in a format readily exportable to csv, or 
equivalently assumed to be directly read in from csv.

In [109]:
global_returns = global_returns.reset_index()

global_returns

Unnamed: 0,gvkey,iid,data_ym,datadate,conm,curcdm,ajexm,cshom,prccm,trfm
0,1166,01W,2014-01,2014-01-31,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,24.850,1.242834
1,1166,01W,2014-02,2014-02-28,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,28.015,1.242834
2,1166,01W,2014-03,2014-03-31,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,29.135,1.242834
3,1166,01W,2014-04,2014-04-30,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,31.495,1.242834
4,1166,01W,2014-05,2014-05-30,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,30.545,1.264115
...,...,...,...,...,...,...,...,...,...,...
2861217,362705,01W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.820,1.147992
2861218,362705,02W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.630,1.147992
2861219,362758,01W,2024-08,2024-08-21,NUREN GROUP LIMITED,AUD,1.0,154500000.0,0.220,1.147992
2861220,362761,01W,2024-08,2024-08-21,AKUMS DRUGS AND PHARMA,INR,1.0,157393988.0,1010.750,1.147992


In [110]:
global_returns.to_csv(
    "global_security_m_returns_2014to2024.csv", 
    index=False, header=True, 
)

In [35]:
global_returns = pd.read_csv(
    "global_security_m_returns_2014to2024.csv", 
    parse_dates=["datadate", "data_ym"], 
)
global_returns["data_ym"] = global_returns["data_ym"].dt.to_period('M')
global_returns = global_returns.set_index(["gvkey", "iid", "data_ym"])

global_returns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,curcdm,ajexm,cshom,prccm,trfm
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1166,01W,2014-01,2014-01-31,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,24.850,1.242834
1166,01W,2014-02,2014-02-28,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,28.015,1.242834
1166,01W,2014-03,2014-03-31,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,29.135,1.242834
1166,01W,2014-04,2014-04-30,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,31.495,1.242834
1166,01W,2014-05,2014-05-30,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,30.545,1.264115
...,...,...,...,...,...,...,...,...,...
362705,01W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.820,1.147992
362705,02W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.630,1.147992
362758,01W,2024-08,2024-08-21,NUREN GROUP LIMITED,AUD,1.0,154500000.0,0.220,1.147992
362761,01W,2024-08,2024-08-21,AKUMS DRUGS AND PHARMA,INR,1.0,157393988.0,1010.750,1.147992


In [36]:
global_m_returns = global_returns

global_m_returns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,curcdm,ajexm,cshom,prccm,trfm
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1166,01W,2014-01,2014-01-31,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,24.850,1.242834
1166,01W,2014-02,2014-02-28,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,28.015,1.242834
1166,01W,2014-03,2014-03-31,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,29.135,1.242834
1166,01W,2014-04,2014-04-30,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,31.495,1.242834
1166,01W,2014-05,2014-05-30,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,30.545,1.264115
...,...,...,...,...,...,...,...,...,...
362705,01W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.820,1.147992
362705,02W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.630,1.147992
362758,01W,2024-08,2024-08-21,NUREN GROUP LIMITED,AUD,1.0,154500000.0,0.220,1.147992
362761,01W,2024-08,2024-08-21,AKUMS DRUGS AND PHARMA,INR,1.0,157393988.0,1010.750,1.147992


**Remarks**

*Notice that across both the NA and Global return data, we make up almost 17k 
out the ~19k companies in the emissions data*

Determine the number of companies that are in both the NA and Global datasets

In [1457]:
na_returns_uniq_gvkey = na_returns_df["gvkey"].unique()
global_returns_uniq_gvkey = global_returns_df["gvkey"].unique()

na_returns_uniq_gvkey[
    np.isin(na_returns_uniq_gvkey, global_returns_uniq_gvkey)
].size

Flushing oldest 200 entries.
  warn('Output cache limit (currently {sz} entries) hit.\n'


972

### NA & Global - Data Preparation

In [205]:
na_m_returns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,curcdm,ajexm,cshom,prccm,trfm
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1004,01,2014-01,2014-01-31,AAR CORP,USD,1.0,39600000.0,26.650,1.623944
1004,01,2014-02,2014-02-28,AAR CORP,USD,1.0,39600000.0,28.900,1.623944
1004,01,2014-03,2014-03-31,AAR CORP,USD,1.0,39569000.0,25.950,1.623944
1004,01,2014-04,2014-04-30,AAR CORP,USD,1.0,39569000.0,25.900,1.628586
1004,01,2014-05,2014-05-30,AAR CORP,USD,1.0,39569000.0,24.300,1.628586
...,...,...,...,...,...,...,...,...,...
351590,01,2024-04,2024-04-30,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,46.864,1.000000
351590,01,2024-05,2024-05-31,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,41.880,1.000000
351590,01,2024-06,2024-06-28,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,38.622,1.000000
351590,01,2024-07,2024-07-31,DAIMLER TRUCK HOLDING AG,USD,1.0,822952000.0,37.938,1.000000


In [206]:
global_m_returns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,curcdm,ajexm,cshom,prccm,trfm
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1166,01W,2014-01,2014-01-31,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,24.850,1.242834
1166,01W,2014-02,2014-02-28,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,28.015,1.242834
1166,01W,2014-03,2014-03-31,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,29.135,1.242834
1166,01W,2014-04,2014-04-30,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,31.495,1.242834
1166,01W,2014-05,2014-05-30,ASM INTERNATIONAL NV,EUR,1.0,63076035.0,30.545,1.264115
...,...,...,...,...,...,...,...,...,...
362705,01W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.820,1.147992
362705,02W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.630,1.147992
362758,01W,2024-08,2024-08-21,NUREN GROUP LIMITED,AUD,1.0,154500000.0,0.220,1.147992
362761,01W,2024-08,2024-08-21,AKUMS DRUGS AND PHARMA,INR,1.0,157393988.0,1010.750,1.147992


In [37]:
m_returns = pd.concat(
    [na_m_returns.reset_index(), global_m_returns.reset_index()], 
    ignore_index=True, 
).groupby(
    ["gvkey", "iid", "data_ym"]
).first().sort_index()

m_returns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,curcdm,ajexm,cshom,prccm,trfm
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1004,01,2014-01,2014-01-31,AAR CORP,USD,1.0,39600000.0,26.65,1.623944
1004,01,2014-02,2014-02-28,AAR CORP,USD,1.0,39600000.0,28.90,1.623944
1004,01,2014-03,2014-03-31,AAR CORP,USD,1.0,39569000.0,25.95,1.623944
1004,01,2014-04,2014-04-30,AAR CORP,USD,1.0,39569000.0,25.90,1.628586
1004,01,2014-05,2014-05-30,AAR CORP,USD,1.0,39569000.0,24.30,1.628586
...,...,...,...,...,...,...,...,...,...
362705,01W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.82,1.147992
362705,02W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.63,1.147992
362758,01W,2024-08,2024-08-21,NUREN GROUP LIMITED,AUD,1.0,154500000.0,0.22,1.147992
362761,01W,2024-08,2024-08-21,AKUMS DRUGS AND PHARMA,INR,1.0,157393988.0,1010.75,1.147992


**Add year-month**

Isolate the year-month from the data point dates, as the data points 
correspond to montly returns, and we are only interested in this 
level of granularity.

Missing values must be addressed - by imputating where possible first, and then 
finally dropping rows (which have null values).
- For imputation, we must determine which variables have missing values and 
decide on the appropriate corrective action for each.

We can see there are missing values in the close prices, adjustment factors and total return factors.

**Imputation**

In [856]:
""" TODO: nothing right now, but can look into what imputation is possible later """

""" if prices and 
adjustment factors are enough to calculate returns, as long as the total return 
factor is all null or all non-null for each issue. """

' TODO: nothing right now, but can look into what imputation is possible later '

**Drop Rows**

Check if the issues have distinct data for each year-month now.

In [38]:
sec_ym_entries = m_returns.groupby(level=["gvkey", "iid", "data_ym"]).size()

sec_ym_entries

gvkey   iid  data_ym
1004    01   2014-01    1
             2014-02    1
             2014-03    1
             2014-04    1
             2014-05    1
                       ..
362705  01W  2024-08    1
        02W  2024-08    1
362758  01W  2024-08    1
362761  01W  2024-08    1
362779  01W  2024-08    1
Length: 3478493, dtype: int64

In [39]:
sec_ym_entries[
    sec_ym_entries != 1
]

Series([], dtype: int64)

The empty series return indicates that this is indeed the case.

**Reindex & Sort**
- by `gvkey`, `iid` and then `data_ym` 
- we know `data_ym` is particularly appropriate 
for indexing now due to its uniqueness within security issues

**Calculate returns.**
- Utilising the year-month entries of each security issue

#### Can skip

Check the number of year-month data records for each issue

In [1127]:
na_returns_issue_yms = na_returns_df.reset_index(
).groupby(
    ["gvkey", "iid"]
).agg(
    {
        "data_ym": lambda vals: vals.count()
    }
)

na_returns_issue_yms

Unnamed: 0_level_0,Unnamed: 1_level_0,data_ym
gvkey,iid,Unnamed: 2_level_1
100001,91,32
100012,01,32
100013,90,32
100022,01,2
100022,02,16
...,...,...
350952,90,31
351491,01,31
351590,01,18
351590,90,32


We can see there are 2204 issues from <1900 companies with varying numbers of 
data points.

In [1128]:
na_returns_df.xs((100022, "01"), level=("gvkey", "iid"))

Unnamed: 0_level_0,datadate,conm,ajexm,curcdm,prccm,trfm,cshom
data_ym,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-07,2022-07-31,BAYER MOTOREN WERKE AG,1.0,USD,73.8,1.0,59404000.0
2024-02,2024-02-29,BAYER MOTOREN WERKE AG,1.0,USD,109.5,1.0,60844000.0


We also observe that the year-month entries of security issues can be 
non-contiguous when they are incomplete.

In [1129]:
na_returns_max_issue_yms = na_returns_issue_yms["data_ym"].max()
print(f"Maximum issue data points: {na_returns_max_issue_yms}")

na_returns_full_issue = na_returns_issue_yms[
    na_returns_issue_yms["data_ym"] == na_returns_max_issue_yms
]

na_returns_full_issue

Maximum issue data points: 32


Unnamed: 0_level_0,Unnamed: 1_level_0,data_ym
gvkey,iid,Unnamed: 2_level_1
100001,91,32
100012,01,32
100013,90,32
100022,90,32
100045,90,32
...,...,...
347328,90,32
347708,90,32
349485,90,32
350366,90,32


We observe there are fewer issues with the maximum number of data points.

#### Continue

Calculate returns as per the Compustat manual. To do this we first need 
adjusted close.

In [40]:
m_returns["adjclose"] = (m_returns["prccm"] / m_returns["ajexm"]) * m_returns["trfm"]

m_returns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,curcdm,ajexm,cshom,prccm,trfm,adjclose
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1004,01,2014-01,2014-01-31,AAR CORP,USD,1.0,39600000.0,26.65,1.623944,43.278106
1004,01,2014-02,2014-02-28,AAR CORP,USD,1.0,39600000.0,28.90,1.623944,46.931980
1004,01,2014-03,2014-03-31,AAR CORP,USD,1.0,39569000.0,25.95,1.623944,42.141346
1004,01,2014-04,2014-04-30,AAR CORP,USD,1.0,39569000.0,25.90,1.628586,42.180366
1004,01,2014-05,2014-05-30,AAR CORP,USD,1.0,39569000.0,24.30,1.628586,39.574629
...,...,...,...,...,...,...,...,...,...,...
362705,01W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.82,1.147992,3.237339
362705,02W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.63,1.147992,3.019220
362758,01W,2024-08,2024-08-21,NUREN GROUP LIMITED,AUD,1.0,154500000.0,0.22,1.147992,0.252558
362761,01W,2024-08,2024-08-21,AKUMS DRUGS AND PHARMA,INR,1.0,157393988.0,1010.75,1.147992,1160.333318


Exchange rates to the home currency (USD) are joined to allow computation of FX 
returns - that form a part of the overall return.

In [41]:
m_returns = m_returns.reset_index().merge(
    m_exrts, 
    how="left", 
    left_on=["curcdm", "data_ym"], 
    right_index=True, 
).set_index(
    ["gvkey", "iid", "data_ym"]
)
m_returns = m_returns.drop(columns="exratd_toGBP")
m_returns = m_returns.dropna()

m_returns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,curcdm,ajexm,cshom,prccm,trfm,adjclose,exratd_toUSD
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1004,01,2014-01,2014-01-31,AAR CORP,USD,1.0,39600000.0,26.65,1.623944,43.278106,1.000000
1004,01,2014-02,2014-02-28,AAR CORP,USD,1.0,39600000.0,28.90,1.623944,46.931980,1.000000
1004,01,2014-03,2014-03-31,AAR CORP,USD,1.0,39569000.0,25.95,1.623944,42.141346,1.000000
1004,01,2014-04,2014-04-30,AAR CORP,USD,1.0,39569000.0,25.90,1.628586,42.180366,1.000000
1004,01,2014-05,2014-05-30,AAR CORP,USD,1.0,39569000.0,24.30,1.628586,39.574629,1.000000
...,...,...,...,...,...,...,...,...,...,...,...
362705,01W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.82,1.147992,3.237339,1.113108
362705,02W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.63,1.147992,3.019220,1.113108
362758,01W,2024-08,2024-08-21,NUREN GROUP LIMITED,AUD,1.0,154500000.0,0.22,1.147992,0.252558,0.675052
362761,01W,2024-08,2024-08-21,AKUMS DRUGS AND PHARMA,INR,1.0,157393988.0,1010.75,1.147992,1160.333318,0.011920


From the adjusted closes and exchange rates, we can now compute the local and FX returns.

In [42]:
m_returns = pd.concat(
    [
        m_returns, 
        m_returns.groupby(
            level=["gvkey", "iid"]
        )[["adjclose", "exratd_toUSD"]].pct_change(
        ).rename(
            columns={
                "adjclose": "local_ret", 
                "exratd_toUSD": "USD_fxret", 
            }
        )
    ], 
    axis=1
)

m_returns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,curcdm,ajexm,cshom,prccm,trfm,adjclose,exratd_toUSD,local_ret,USD_fxret
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1004,01,2014-01,2014-01-31,AAR CORP,USD,1.0,39600000.0,26.65,1.623944,43.278106,1.000000,,
1004,01,2014-02,2014-02-28,AAR CORP,USD,1.0,39600000.0,28.90,1.623944,46.931980,1.000000,0.084428,0.000000
1004,01,2014-03,2014-03-31,AAR CORP,USD,1.0,39569000.0,25.95,1.623944,42.141346,1.000000,-0.102076,0.000000
1004,01,2014-04,2014-04-30,AAR CORP,USD,1.0,39569000.0,25.90,1.628586,42.180366,1.000000,0.000926,0.000000
1004,01,2014-05,2014-05-30,AAR CORP,USD,1.0,39569000.0,24.30,1.628586,39.574629,1.000000,-0.061776,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
362705,01W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.82,1.147992,3.237339,1.113108,-0.066225,0.028504
362705,02W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.63,1.147992,3.019220,1.113108,,
362758,01W,2024-08,2024-08-21,NUREN GROUP LIMITED,AUD,1.0,154500000.0,0.22,1.147992,0.252558,0.675052,,
362761,01W,2024-08,2024-08-21,AKUMS DRUGS AND PHARMA,INR,1.0,157393988.0,1010.75,1.147992,1160.333318,0.011920,,


Combine local and FX returns to obtain USD returns. Note that these are 
raw returns with varying frequencies.

In [43]:
m_returns["USD_ret"] = (
    (1 + m_returns["local_ret"]) * (1 + m_returns["USD_fxret"]) - 1
)

m_returns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,curcdm,ajexm,cshom,prccm,trfm,adjclose,exratd_toUSD,local_ret,USD_fxret,USD_ret
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1004,01,2014-01,2014-01-31,AAR CORP,USD,1.0,39600000.0,26.65,1.623944,43.278106,1.000000,,,
1004,01,2014-02,2014-02-28,AAR CORP,USD,1.0,39600000.0,28.90,1.623944,46.931980,1.000000,0.084428,0.000000,0.084428
1004,01,2014-03,2014-03-31,AAR CORP,USD,1.0,39569000.0,25.95,1.623944,42.141346,1.000000,-0.102076,0.000000,-0.102076
1004,01,2014-04,2014-04-30,AAR CORP,USD,1.0,39569000.0,25.90,1.628586,42.180366,1.000000,0.000926,0.000000,0.000926
1004,01,2014-05,2014-05-30,AAR CORP,USD,1.0,39569000.0,24.30,1.628586,39.574629,1.000000,-0.061776,0.000000,-0.061776
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362705,01W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.82,1.147992,3.237339,1.113108,-0.066225,0.028504,-0.039608
362705,02W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.63,1.147992,3.019220,1.113108,,,
362758,01W,2024-08,2024-08-21,NUREN GROUP LIMITED,AUD,1.0,154500000.0,0.22,1.147992,0.252558,0.675052,,,
362761,01W,2024-08,2024-08-21,AKUMS DRUGS AND PHARMA,INR,1.0,157393988.0,1010.75,1.147992,1160.333318,0.011920,,,


The differences in months of these raw returns can be used to standardise them 
to a common frequency (monthly).

In [44]:
""" Differences in months, that can be used to standardise returns to the 
same frequency - monthly """
m_returns["ret_mspan"] = m_returns.reset_index(
).groupby(
    ["gvkey", "iid"]
)["data_ym"].diff(
).apply(
    lambda x: x.n if pd.notnull(x) else np.nan
).values

m_returns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,curcdm,ajexm,cshom,prccm,trfm,adjclose,exratd_toUSD,local_ret,USD_fxret,USD_ret,ret_mspan
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1004,01,2014-01,2014-01-31,AAR CORP,USD,1.0,39600000.0,26.65,1.623944,43.278106,1.000000,,,,
1004,01,2014-02,2014-02-28,AAR CORP,USD,1.0,39600000.0,28.90,1.623944,46.931980,1.000000,0.084428,0.000000,0.084428,1.0
1004,01,2014-03,2014-03-31,AAR CORP,USD,1.0,39569000.0,25.95,1.623944,42.141346,1.000000,-0.102076,0.000000,-0.102076,1.0
1004,01,2014-04,2014-04-30,AAR CORP,USD,1.0,39569000.0,25.90,1.628586,42.180366,1.000000,0.000926,0.000000,0.000926,1.0
1004,01,2014-05,2014-05-30,AAR CORP,USD,1.0,39569000.0,24.30,1.628586,39.574629,1.000000,-0.061776,0.000000,-0.061776,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362705,01W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.82,1.147992,3.237339,1.113108,-0.066225,0.028504,-0.039608,1.0
362705,02W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.63,1.147992,3.019220,1.113108,,,,
362758,01W,2024-08,2024-08-21,NUREN GROUP LIMITED,AUD,1.0,154500000.0,0.22,1.147992,0.252558,0.675052,,,,
362761,01W,2024-08,2024-08-21,AKUMS DRUGS AND PHARMA,INR,1.0,157393988.0,1010.75,1.147992,1160.333318,0.011920,,,,


*Sanity check - there should be 0 return month frequencies that 
are non-null for the first year-month entries of issues*

In [45]:
pd.notnull(m_returns.groupby(
    level=["gvkey", "iid"]
).agg(
    {
        "ret_mspan": lambda vals: vals.iloc[0]
    }
)).sum()

ret_mspan    0
dtype: int64

Standardise the raw USD returns to the monthly frequency.

In [46]:
def compute_m_returns(df, rr_label, rspan_label="ret_mspan"):
    df[f"m_{rr_label}"] = (
        (1 + df[rr_label]) ** (1/df[rspan_label])
    ) - 1

compute_m_returns(m_returns, "USD_ret")
compute_m_returns(m_returns, "local_ret")

m_returns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,curcdm,ajexm,cshom,prccm,trfm,adjclose,exratd_toUSD,local_ret,USD_fxret,USD_ret,ret_mspan,m_USD_ret,m_local_ret
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1004,01,2014-01,2014-01-31,AAR CORP,USD,1.0,39600000.0,26.65,1.623944,43.278106,1.000000,,,,,,
1004,01,2014-02,2014-02-28,AAR CORP,USD,1.0,39600000.0,28.90,1.623944,46.931980,1.000000,0.084428,0.000000,0.084428,1.0,0.084428,0.084428
1004,01,2014-03,2014-03-31,AAR CORP,USD,1.0,39569000.0,25.95,1.623944,42.141346,1.000000,-0.102076,0.000000,-0.102076,1.0,-0.102076,-0.102076
1004,01,2014-04,2014-04-30,AAR CORP,USD,1.0,39569000.0,25.90,1.628586,42.180366,1.000000,0.000926,0.000000,0.000926,1.0,0.000926,0.000926
1004,01,2014-05,2014-05-30,AAR CORP,USD,1.0,39569000.0,24.30,1.628586,39.574629,1.000000,-0.061776,0.000000,-0.061776,1.0,-0.061776,-0.061776
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362705,01W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.82,1.147992,3.237339,1.113108,-0.066225,0.028504,-0.039608,1.0,-0.039608,-0.066225
362705,02W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.63,1.147992,3.019220,1.113108,,,,,,
362758,01W,2024-08,2024-08-21,NUREN GROUP LIMITED,AUD,1.0,154500000.0,0.22,1.147992,0.252558,0.675052,,,,,,
362761,01W,2024-08,2024-08-21,AKUMS DRUGS AND PHARMA,INR,1.0,157393988.0,1010.75,1.147992,1160.333318,0.011920,,,,,,


*Notice that there are companies that have gone bankrupt (0 raw close price & 
0 adjusted close price), which leads to further null returns beyond just the 
first month for some companies (due to division by 0).*

In [47]:
m_returns = m_returns.copy()

At this point, the computation of monthly returns has been completed.
- w.r.t base currencies of USD and GBP.

**Calculate monthly market capitalisation**
- standardise the currency (USD)

First, per security

In [48]:
m_returns["USD_secval"] = m_returns["cshom"] * m_returns["prccm"] * m_returns["exratd_toUSD"]

m_returns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,curcdm,ajexm,cshom,prccm,trfm,adjclose,exratd_toUSD,local_ret,USD_fxret,USD_ret,ret_mspan,m_USD_ret,m_local_ret,USD_secval
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1004,01,2014-01,2014-01-31,AAR CORP,USD,1.0,39600000.0,26.65,1.623944,43.278106,1.000000,,,,,,,1.055340e+09
1004,01,2014-02,2014-02-28,AAR CORP,USD,1.0,39600000.0,28.90,1.623944,46.931980,1.000000,0.084428,0.000000,0.084428,1.0,0.084428,0.084428,1.144440e+09
1004,01,2014-03,2014-03-31,AAR CORP,USD,1.0,39569000.0,25.95,1.623944,42.141346,1.000000,-0.102076,0.000000,-0.102076,1.0,-0.102076,-0.102076,1.026816e+09
1004,01,2014-04,2014-04-30,AAR CORP,USD,1.0,39569000.0,25.90,1.628586,42.180366,1.000000,0.000926,0.000000,0.000926,1.0,0.000926,0.000926,1.024837e+09
1004,01,2014-05,2014-05-30,AAR CORP,USD,1.0,39569000.0,24.30,1.628586,39.574629,1.000000,-0.061776,0.000000,-0.061776,1.0,-0.061776,-0.061776,9.615267e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362705,01W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.82,1.147992,3.237339,1.113108,-0.066225,0.028504,-0.039608,1.0,-0.039608,-0.066225,8.305700e+07
362705,02W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.63,1.147992,3.019220,1.113108,,,,,,,7.746096e+07
362758,01W,2024-08,2024-08-21,NUREN GROUP LIMITED,AUD,1.0,154500000.0,0.22,1.147992,0.252558,0.675052,,,,,,,2.294501e+07
362761,01W,2024-08,2024-08-21,AKUMS DRUGS AND PHARMA,INR,1.0,157393988.0,1010.75,1.147992,1160.333318,0.011920,,,,,,,1.896297e+09


Now aggregate by gvkey and year-month to determine the total market value from 
all the securities.

In [49]:
m_returns = m_returns.join(
    m_returns.groupby(
        level=["gvkey", "data_ym"]
    )["USD_secval"].sum().rename("USD_mktval"), 
    on=["gvkey", "data_ym"]
)

m_returns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,curcdm,ajexm,cshom,prccm,trfm,adjclose,exratd_toUSD,local_ret,USD_fxret,USD_ret,ret_mspan,m_USD_ret,m_local_ret,USD_secval,USD_mktval
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1004,01,2014-01,2014-01-31,AAR CORP,USD,1.0,39600000.0,26.65,1.623944,43.278106,1.000000,,,,,,,1.055340e+09,1.055340e+09
1004,01,2014-02,2014-02-28,AAR CORP,USD,1.0,39600000.0,28.90,1.623944,46.931980,1.000000,0.084428,0.000000,0.084428,1.0,0.084428,0.084428,1.144440e+09,1.144440e+09
1004,01,2014-03,2014-03-31,AAR CORP,USD,1.0,39569000.0,25.95,1.623944,42.141346,1.000000,-0.102076,0.000000,-0.102076,1.0,-0.102076,-0.102076,1.026816e+09,1.026816e+09
1004,01,2014-04,2014-04-30,AAR CORP,USD,1.0,39569000.0,25.90,1.628586,42.180366,1.000000,0.000926,0.000000,0.000926,1.0,0.000926,0.000926,1.024837e+09,1.024837e+09
1004,01,2014-05,2014-05-30,AAR CORP,USD,1.0,39569000.0,24.30,1.628586,39.574629,1.000000,-0.061776,0.000000,-0.061776,1.0,-0.061776,-0.061776,9.615267e+08,9.615267e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362705,01W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.82,1.147992,3.237339,1.113108,-0.066225,0.028504,-0.039608,1.0,-0.039608,-0.066225,8.305700e+07,1.605180e+08
362705,02W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.63,1.147992,3.019220,1.113108,,,,,,,7.746096e+07,1.605180e+08
362758,01W,2024-08,2024-08-21,NUREN GROUP LIMITED,AUD,1.0,154500000.0,0.22,1.147992,0.252558,0.675052,,,,,,,2.294501e+07,2.294501e+07
362761,01W,2024-08,2024-08-21,AKUMS DRUGS AND PHARMA,INR,1.0,157393988.0,1010.75,1.147992,1160.333318,0.011920,,,,,,,1.896297e+09,1.896297e+09


Shift the calculated market values one forward, in each security, to prevent look-ahead bias in return predictions.

In [50]:
m_returns[
    "X_USD_mktval"
] = m_returns.groupby(
    level=["gvkey", "iid"]
)[
    "USD_mktval"
].shift()

m_returns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,curcdm,ajexm,cshom,prccm,trfm,adjclose,exratd_toUSD,local_ret,USD_fxret,USD_ret,ret_mspan,m_USD_ret,m_local_ret,USD_secval,USD_mktval,X_USD_mktval
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1004,01,2014-01,2014-01-31,AAR CORP,USD,1.0,39600000.0,26.65,1.623944,43.278106,1.000000,,,,,,,1.055340e+09,1.055340e+09,
1004,01,2014-02,2014-02-28,AAR CORP,USD,1.0,39600000.0,28.90,1.623944,46.931980,1.000000,0.084428,0.000000,0.084428,1.0,0.084428,0.084428,1.144440e+09,1.144440e+09,1.055340e+09
1004,01,2014-03,2014-03-31,AAR CORP,USD,1.0,39569000.0,25.95,1.623944,42.141346,1.000000,-0.102076,0.000000,-0.102076,1.0,-0.102076,-0.102076,1.026816e+09,1.026816e+09,1.144440e+09
1004,01,2014-04,2014-04-30,AAR CORP,USD,1.0,39569000.0,25.90,1.628586,42.180366,1.000000,0.000926,0.000000,0.000926,1.0,0.000926,0.000926,1.024837e+09,1.024837e+09,1.026816e+09
1004,01,2014-05,2014-05-30,AAR CORP,USD,1.0,39569000.0,24.30,1.628586,39.574629,1.000000,-0.061776,0.000000,-0.061776,1.0,-0.061776,-0.061776,9.615267e+08,9.615267e+08,1.024837e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362705,01W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.82,1.147992,3.237339,1.113108,-0.066225,0.028504,-0.039608,1.0,-0.039608,-0.066225,8.305700e+07,1.605180e+08,8.648243e+07
362705,02W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.63,1.147992,3.019220,1.113108,,,,,,,7.746096e+07,1.605180e+08,
362758,01W,2024-08,2024-08-21,NUREN GROUP LIMITED,AUD,1.0,154500000.0,0.22,1.147992,0.252558,0.675052,,,,,,,2.294501e+07,2.294501e+07,
362761,01W,2024-08,2024-08-21,AKUMS DRUGS AND PHARMA,INR,1.0,157393988.0,1010.75,1.147992,1160.333318,0.011920,,,,,,,1.896297e+09,1.896297e+09,


**Finalise**

Returns from fractions to percentages.

In [51]:
m_returns["local_ret"] *= 100
m_returns["USD_fxret"] *= 100
# 
m_returns["USD_ret"] *= 100
# 
m_returns["m_USD_ret"] *= 100
m_returns["m_local_ret"] *= 100

m_returns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,curcdm,ajexm,cshom,prccm,trfm,adjclose,exratd_toUSD,local_ret,USD_fxret,USD_ret,ret_mspan,m_USD_ret,m_local_ret,USD_secval,USD_mktval,X_USD_mktval
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1004,01,2014-01,2014-01-31,AAR CORP,USD,1.0,39600000.0,26.65,1.623944,43.278106,1.000000,,,,,,,1.055340e+09,1.055340e+09,
1004,01,2014-02,2014-02-28,AAR CORP,USD,1.0,39600000.0,28.90,1.623944,46.931980,1.000000,8.442777,0.000000,8.442777,1.0,8.442777,8.442777,1.144440e+09,1.144440e+09,1.055340e+09
1004,01,2014-03,2014-03-31,AAR CORP,USD,1.0,39569000.0,25.95,1.623944,42.141346,1.000000,-10.207612,0.000000,-10.207612,1.0,-10.207612,-10.207612,1.026816e+09,1.026816e+09,1.144440e+09
1004,01,2014-04,2014-04-30,AAR CORP,USD,1.0,39569000.0,25.90,1.628586,42.180366,1.000000,0.092594,0.000000,0.092594,1.0,0.092594,0.092594,1.024837e+09,1.024837e+09,1.026816e+09
1004,01,2014-05,2014-05-30,AAR CORP,USD,1.0,39569000.0,24.30,1.628586,39.574629,1.000000,-6.177606,0.000000,-6.177606,1.0,-6.177606,-6.177606,9.615267e+08,9.615267e+08,1.024837e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362705,01W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.82,1.147992,3.237339,1.113108,-6.622517,2.850443,-3.960845,1.0,-3.960845,-6.622517,8.305700e+07,1.605180e+08,8.648243e+07
362705,02W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.63,1.147992,3.019220,1.113108,,,,,,,7.746096e+07,1.605180e+08,
362758,01W,2024-08,2024-08-21,NUREN GROUP LIMITED,AUD,1.0,154500000.0,0.22,1.147992,0.252558,0.675052,,,,,,,2.294501e+07,2.294501e+07,
362761,01W,2024-08,2024-08-21,AKUMS DRUGS AND PHARMA,INR,1.0,157393988.0,1010.75,1.147992,1160.333318,0.011920,,,,,,,1.896297e+09,1.896297e+09,


Augment with year columns for linking with fundamentals and emissions data.

In [52]:
m_returns["datayear"] = m_returns.reset_index()["data_ym"].dt.year.values
m_returns["datayear-1"] = m_returns["datayear"] - 1

m_returns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,curcdm,ajexm,cshom,prccm,trfm,adjclose,exratd_toUSD,local_ret,USD_fxret,USD_ret,ret_mspan,m_USD_ret,m_local_ret,USD_secval,USD_mktval,X_USD_mktval,datayear,datayear-1
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1004,01,2014-01,2014-01-31,AAR CORP,USD,1.0,39600000.0,26.65,1.623944,43.278106,1.000000,,,,,,,1.055340e+09,1.055340e+09,,2014,2013
1004,01,2014-02,2014-02-28,AAR CORP,USD,1.0,39600000.0,28.90,1.623944,46.931980,1.000000,8.442777,0.000000,8.442777,1.0,8.442777,8.442777,1.144440e+09,1.144440e+09,1.055340e+09,2014,2013
1004,01,2014-03,2014-03-31,AAR CORP,USD,1.0,39569000.0,25.95,1.623944,42.141346,1.000000,-10.207612,0.000000,-10.207612,1.0,-10.207612,-10.207612,1.026816e+09,1.026816e+09,1.144440e+09,2014,2013
1004,01,2014-04,2014-04-30,AAR CORP,USD,1.0,39569000.0,25.90,1.628586,42.180366,1.000000,0.092594,0.000000,0.092594,1.0,0.092594,0.092594,1.024837e+09,1.024837e+09,1.026816e+09,2014,2013
1004,01,2014-05,2014-05-30,AAR CORP,USD,1.0,39569000.0,24.30,1.628586,39.574629,1.000000,-6.177606,0.000000,-6.177606,1.0,-6.177606,-6.177606,9.615267e+08,9.615267e+08,1.024837e+09,2014,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362705,01W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.82,1.147992,3.237339,1.113108,-6.622517,2.850443,-3.960845,1.0,-3.960845,-6.622517,8.305700e+07,1.605180e+08,8.648243e+07,2024,2023
362705,02W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.63,1.147992,3.019220,1.113108,,,,,,,7.746096e+07,1.605180e+08,,2024,2023
362758,01W,2024-08,2024-08-21,NUREN GROUP LIMITED,AUD,1.0,154500000.0,0.22,1.147992,0.252558,0.675052,,,,,,,2.294501e+07,2.294501e+07,,2024,2023
362761,01W,2024-08,2024-08-21,AKUMS DRUGS AND PHARMA,INR,1.0,157393988.0,1010.75,1.147992,1160.333318,0.011920,,,,,,,1.896297e+09,1.896297e+09,,2024,2023


Join market return in.

In [53]:
m_returns = m_returns.join(
    mkt_df, 
    how="left", 
    on="data_ym", 
)

m_returns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,curcdm,ajexm,cshom,prccm,trfm,adjclose,exratd_toUSD,local_ret,...,USD_ret,ret_mspan,m_USD_ret,m_local_ret,USD_secval,USD_mktval,X_USD_mktval,datayear,datayear-1,m_mktret
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1004,01,2014-01,2014-01-31,AAR CORP,USD,1.0,39600000.0,26.65,1.623944,43.278106,1.000000,,...,,,,,1.055340e+09,1.055340e+09,,2014,2013,
1004,01,2014-02,2014-02-28,AAR CORP,USD,1.0,39600000.0,28.90,1.623944,46.931980,1.000000,8.442777,...,8.442777,1.0,8.442777,8.442777,1.144440e+09,1.144440e+09,1.055340e+09,2014,2013,4.694558
1004,01,2014-03,2014-03-31,AAR CORP,USD,1.0,39569000.0,25.95,1.623944,42.141346,1.000000,-10.207612,...,-10.207612,1.0,-10.207612,-10.207612,1.026816e+09,1.026816e+09,1.144440e+09,2014,2013,0.095213
1004,01,2014-04,2014-04-30,AAR CORP,USD,1.0,39569000.0,25.90,1.628586,42.180366,1.000000,0.092594,...,0.092594,1.0,0.092594,0.092594,1.024837e+09,1.024837e+09,1.026816e+09,2014,2013,1.020496
1004,01,2014-05,2014-05-30,AAR CORP,USD,1.0,39569000.0,24.30,1.628586,39.574629,1.000000,-6.177606,...,-6.177606,1.0,-6.177606,-6.177606,9.615267e+08,9.615267e+08,1.024837e+09,2014,2013,1.579912
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362705,01W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.82,1.147992,3.237339,1.113108,-6.622517,...,-3.960845,1.0,-3.960845,-6.622517,8.305700e+07,1.605180e+08,8.648243e+07,2024,2023,
362705,02W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.63,1.147992,3.019220,1.113108,,...,,,,,7.746096e+07,1.605180e+08,,2024,2023,
362758,01W,2024-08,2024-08-21,NUREN GROUP LIMITED,AUD,1.0,154500000.0,0.22,1.147992,0.252558,0.675052,,...,,,,,2.294501e+07,2.294501e+07,,2024,2023,
362761,01W,2024-08,2024-08-21,AKUMS DRUGS AND PHARMA,INR,1.0,157393988.0,1010.75,1.147992,1160.333318,0.011920,,...,,,,,1.896297e+09,1.896297e+09,,2024,2023,


Compute rolling beta within issues.
- From rolling covariances and variances.

In [54]:
m_return_betas = m_returns.reset_index(
).groupby(
    ["gvkey", "iid"] # within issues
)[["m_USD_ret", "m_mktret"]].rolling(
    12
).cov().unstack()["m_mktret"].rename(
    columns={
        "m_USD_ret": "cov(i,m)", 
        "m_mktret": "var(m)", 
    }
).set_index(m_returns.index)

m_return_betas

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,"cov(i,m)",var(m)
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1
1004,01,2014-01,,
1004,01,2014-02,,
1004,01,2014-03,,
1004,01,2014-04,,
1004,01,2014-05,,
...,...,...,...,...
362705,01W,2024-08,,
362705,02W,2024-08,,
362758,01W,2024-08,,
362761,01W,2024-08,,


Once again shift the calculated betas one forward, in each security, to prevent look-ahead bias.

In [55]:
m_return_betas["beta"] = m_return_betas["cov(i,m)"] / m_return_betas["var(m)"]
m_return_betas = m_return_betas.drop(columns=["cov(i,m)", "var(m)"])

m_return_betas["X_beta"] = m_return_betas.groupby(
    level=["gvkey", "iid"]
).shift()

m_return_betas

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,beta,X_beta
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1
1004,01,2014-01,,
1004,01,2014-02,,
1004,01,2014-03,,
1004,01,2014-04,,
1004,01,2014-05,,
...,...,...,...,...
362705,01W,2024-08,,
362705,02W,2024-08,,
362758,01W,2024-08,,
362761,01W,2024-08,,


Beta is concatenated together with the returns, aligning with the index.

In [56]:
m_returns = pd.concat([m_returns, m_return_betas], axis=1)

m_returns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,curcdm,ajexm,cshom,prccm,trfm,adjclose,exratd_toUSD,local_ret,...,m_USD_ret,m_local_ret,USD_secval,USD_mktval,X_USD_mktval,datayear,datayear-1,m_mktret,beta,X_beta
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1004,01,2014-01,2014-01-31,AAR CORP,USD,1.0,39600000.0,26.65,1.623944,43.278106,1.000000,,...,,,1.055340e+09,1.055340e+09,,2014,2013,,,
1004,01,2014-02,2014-02-28,AAR CORP,USD,1.0,39600000.0,28.90,1.623944,46.931980,1.000000,8.442777,...,8.442777,8.442777,1.144440e+09,1.144440e+09,1.055340e+09,2014,2013,4.694558,,
1004,01,2014-03,2014-03-31,AAR CORP,USD,1.0,39569000.0,25.95,1.623944,42.141346,1.000000,-10.207612,...,-10.207612,-10.207612,1.026816e+09,1.026816e+09,1.144440e+09,2014,2013,0.095213,,
1004,01,2014-04,2014-04-30,AAR CORP,USD,1.0,39569000.0,25.90,1.628586,42.180366,1.000000,0.092594,...,0.092594,0.092594,1.024837e+09,1.024837e+09,1.026816e+09,2014,2013,1.020496,,
1004,01,2014-05,2014-05-30,AAR CORP,USD,1.0,39569000.0,24.30,1.628586,39.574629,1.000000,-6.177606,...,-6.177606,-6.177606,9.615267e+08,9.615267e+08,1.024837e+09,2014,2013,1.579912,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362705,01W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.82,1.147992,3.237339,1.113108,-6.622517,...,-3.960845,-6.622517,8.305700e+07,1.605180e+08,8.648243e+07,2024,2023,,,
362705,02W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.63,1.147992,3.019220,1.113108,,...,,,7.746096e+07,1.605180e+08,,2024,2023,,,
362758,01W,2024-08,2024-08-21,NUREN GROUP LIMITED,AUD,1.0,154500000.0,0.22,1.147992,0.252558,0.675052,,...,,,2.294501e+07,2.294501e+07,,2024,2023,,,
362761,01W,2024-08,2024-08-21,AKUMS DRUGS AND PHARMA,INR,1.0,157393988.0,1010.75,1.147992,1160.333318,0.011920,,...,,,1.896297e+09,1.896297e+09,,2024,2023,,,


Finally, drop all rows with null values.

In [57]:
m_returns = m_returns.dropna()

m_returns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,curcdm,ajexm,cshom,prccm,trfm,adjclose,exratd_toUSD,local_ret,...,m_USD_ret,m_local_ret,USD_secval,USD_mktval,X_USD_mktval,datayear,datayear-1,m_mktret,beta,X_beta
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1004,01,2015-02,2015-02-27,AAR CORP,USD,1.0,39791000.0,29.40,1.642484,48.289041,1.000000,2.581996,...,2.581996,2.581996,1.169855e+09,1.169855e+09,1.140410e+09,2015,2014,5.471863,0.836725,1.410778
1004,01,2015-03,2015-03-31,AAR CORP,USD,1.0,39791000.0,30.70,1.642484,50.424271,1.000000,4.421769,...,4.421769,4.421769,1.221584e+09,1.221584e+09,1.169855e+09,2015,2014,-1.850297,0.596864,0.836725
1004,01,2015-04,2015-04-30,AAR CORP,USD,1.0,39661000.0,30.24,1.646443,49.788431,1.000000,-1.260980,...,-1.260980,-1.260980,1.199349e+09,1.199349e+09,1.221584e+09,2015,2014,2.539511,0.480904,0.596864
1004,01,2015-05,2015-05-29,AAR CORP,USD,1.0,39661000.0,29.54,1.646443,48.635921,1.000000,-2.314815,...,-2.314815,-2.314815,1.171586e+09,1.171586e+09,1.199349e+09,2015,2014,-0.121440,0.663839,0.480904
1004,01,2015-06,2015-06-30,AAR CORP,USD,1.0,39661000.0,31.87,1.646443,52.472133,1.000000,7.887610,...,7.887610,7.887610,1.263996e+09,1.263996e+09,1.171586e+09,2015,2014,-2.580281,0.140564,0.663839
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361486,01W,2024-03,2024-03-29,FABRICA DE SCULE RASNOV SA,RON,1.0,5650039.0,3.60,1.102500,3.969000,0.217178,0.000000,...,-0.152682,0.000000,4.417436e+06,4.417436e+06,4.424191e+06,2024,2023,3.153459,0.452607,0.468541
361486,01W,2024-04,2024-04-30,FABRICA DE SCULE RASNOV SA,RON,1.0,5650039.0,3.60,1.102500,3.969000,0.214867,0.000000,...,-1.064424,0.000000,4.370415e+06,4.370415e+06,4.417436e+06,2024,2023,-3.654085,0.414786,0.452607
361486,01W,2024-05,2024-05-31,FABRICA DE SCULE RASNOV SA,RON,1.0,5650039.0,9.50,1.102500,10.473750,0.217907,163.888889,...,167.622508,163.888889,1.169622e+07,1.169622e+07,4.370415e+06,2024,2023,4.284219,2.441378,0.414786
361486,01W,2024-06,2024-06-28,FABRICA DE SCULE RASNOV SA,RON,1.0,5650039.0,12.20,1.102500,13.450500,0.215265,28.421053,...,26.864346,28.421053,1.483833e+07,1.483833e+07,1.169622e+07,2024,2023,2.087410,2.973729,2.441378


#### Optional QC

For this whole dataframe, the following should be noted:
- rolling beta computation causes many null values at the start of security entries
  - shifting to prevent look-ahead pushes in a further null
- the first security entry has null returns
- shifting market value to prevent look-ahead once again pushes in a null

In [58]:
m_returns.describe()

Unnamed: 0,ajexm,cshom,prccm,trfm,adjclose,exratd_toUSD,local_ret,USD_fxret,USD_ret,ret_mspan,m_USD_ret,m_local_ret,USD_secval,USD_mktval,X_USD_mktval,datayear,datayear-1,m_mktret,beta,X_beta
count,2964769.0,2964769.0,2964769.0,2964769.0,2964769.0,2964769.0,2964769.0,2964769.0,2964769.0,2964769.0,2964769.0,2964769.0,2964769.0,2964769.0,2964769.0,2964769.0,2964769.0,2964769.0,2964769.0,2964769.0
mean,1.187213,2219655000.0,2599.763,6.353686,13754.97,0.4968647,54.88095,-0.08894104,54.81879,1.006416,47.17179,47.23473,5649694000.0,20262500000.0,20141400000.0,2019.528,2018.528,0.7647547,-2.856569,-2.902776
std,2.063503,85327510000.0,27379.65,3741.609,1832237.0,0.5038307,41024.91,16.06003,41026.12,0.5420143,40105.76,40104.52,28499170000.0,113246500000.0,112203300000.0,2.718561,2.718561,4.378971,3248.307,3247.685
min,5e-08,0.0,1e-06,1.0,1e-06,3.92946e-05,-99.99996,-99.69494,-99.99996,1.0,-99.99996,-99.99996,0.0,1.755,3.231,2015.0,2014.0,-13.18113,-1813294.0,-1813294.0
25%,1.0,39171990.0,5.0,1.030423,6.612383,0.03396126,-5.38797,-0.9282515,-5.78678,1.0,-5.781262,-5.383734,199380200.0,238842600.0,238858400.0,2017.0,2016.0,-2.043487,0.2741334,0.272427
50%,1.0,171977000.0,20.38,1.164562,26.68,0.1551776,0.0,0.0,-0.01934878,1.0,-0.01934101,0.0,793800900.0,1255716000.0,1253500000.0,2020.0,2019.0,1.239566,0.8358552,0.8371212
75%,1.0,806504300.0,115.6,1.576426,175.1996,1.0,5.714286,0.6462257,5.948656,1.0,5.941818,5.711588,2826342000.0,5490539000.0,5478155000.0,2022.0,2021.0,3.189458,1.502049,1.505038
max,290.0519,13330470000000.0,5808000.0,5310840.0,691892900.0,3.378142,59999900.0,24674.8,59999900.0,109.0,59999900.0,59999900.0,3405393000000.0,5942845000000.0,5942845000000.0,2024.0,2023.0,12.22895,646787.6,646787.6


We also observe extreme outliers for return, market value & beta.

In [1199]:
# na_returns_df.xs((140044, "01"))
# na_returns_df.xs((311798, "01"))

Thus, we create an instance will the null values and outlier rows filtered out.
- at the 0.1% level

In [257]:
# na_returns = na_returns[
#     (na_returns["m_USD_ret"].quantile(0.001) < na_returns["m_USD_ret"])
#     & (na_returns["m_USD_ret"] < na_returns["m_USD_ret"].quantile(0.999))
#     & (na_returns["beta"].quantile(0.001) < na_returns["beta"])
#     & (na_returns["beta"] < na_returns["beta"].quantile(0.999))
#     & (na_returns["USD_mktval"].quantile(0.001) < na_returns["USD_mktval"])
#     & (na_returns["USD_mktval"] < na_returns["USD_mktval"].quantile(0.999))
# ]

m_returns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,curcdm,ajexm,cshom,prccm,trfm,adjclose,exratd_toUSD,local_ret,...,m_USD_ret,m_local_ret,USD_secval,USD_mktval,X_USD_mktval,datayear,datayear-1,m_mktret,beta,X_beta
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1004,01,2014-01,2014-01-31,AAR CORP,USD,1.0,39600000.0,26.65,1.623944,43.278106,1.000000,,...,,,1.055340e+09,1.055340e+09,,2014,2013,,,
1004,01,2014-02,2014-02-28,AAR CORP,USD,1.0,39600000.0,28.90,1.623944,46.931980,1.000000,8.442777,...,8.442777,8.442777,1.144440e+09,1.144440e+09,1.055340e+09,2014,2013,4.694558,,
1004,01,2014-03,2014-03-31,AAR CORP,USD,1.0,39569000.0,25.95,1.623944,42.141346,1.000000,-10.207612,...,-10.207612,-10.207612,1.026816e+09,1.026816e+09,1.144440e+09,2014,2013,0.095213,,
1004,01,2014-04,2014-04-30,AAR CORP,USD,1.0,39569000.0,25.90,1.628586,42.180366,1.000000,0.092594,...,0.092594,0.092594,1.024837e+09,1.024837e+09,1.026816e+09,2014,2013,1.020496,,
1004,01,2014-05,2014-05-30,AAR CORP,USD,1.0,39569000.0,24.30,1.628586,39.574629,1.000000,-6.177606,...,-6.177606,-6.177606,9.615267e+08,9.615267e+08,1.024837e+09,2014,2013,1.579912,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362705,01W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.82,1.147992,3.237339,1.113108,-6.622517,...,-3.960845,-6.622517,8.305700e+07,1.605180e+08,8.648243e+07,2024,2023,,,
362705,02W,2024-08,2024-08-21,MISITANO & STRACUZZI S P A,EUR,1.0,26460000.0,2.63,1.147992,3.019220,1.113108,,...,,,7.746096e+07,1.605180e+08,,2024,2023,,,
362758,01W,2024-08,2024-08-21,NUREN GROUP LIMITED,AUD,1.0,154500000.0,0.22,1.147992,0.252558,0.675052,,...,,,2.294501e+07,2.294501e+07,,2024,2023,,,
362761,01W,2024-08,2024-08-21,AKUMS DRUGS AND PHARMA,INR,1.0,157393988.0,1010.75,1.147992,1160.333318,0.011920,,...,,,1.896297e+09,1.896297e+09,,2024,2023,,,


In [258]:
m_returns.describe()

Unnamed: 0,ajexm,cshom,prccm,trfm,adjclose,exratd_toUSD,local_ret,USD_fxret,USD_ret,ret_mspan,m_USD_ret,m_local_ret,USD_secval,USD_mktval,X_USD_mktval,datayear,datayear-1,m_mktret,beta,X_beta
count,3478493.0,3478493.0,3478493.0,3478493.0,3478493.0,3478493.0,3439982.0,3439982.0,3439982.0,3439982.0,3439982.0,3439982.0,3478493.0,3478493.0,3439982.0,3478493.0,3478493.0,3427583.0,3001259.0,2992752.0
mean,1.223254,2196598000.0,2579.741,999.255,90760.72,0.5046947,54.32607,-0.1176851,54.24129,1.008295,44.67695,44.76162,5538351000.0,19977340000.0,19895130000.0,2019.071,2018.071,0.7347236,-2.787099,-2.854971
std,2.604588,85838650000.0,27961.12,534989.4,41862560.0,0.5096464,38583.7,14.92885,38585.06,0.5631355,37450.16,37448.77,28161180000.0,113051200000.0,112171000000.0,3.032324,3.032324,4.22692,3228.652,3232.489
min,5e-08,0.0,1e-06,1.0,1e-06,3.92946e-05,-99.99996,-99.69494,-99.99996,1.0,-99.99996,-99.99996,0.0,1.755,1.755,2014.0,2013.0,-13.18113,-1813294.0,-1813294.0
25%,1.0,38366000.0,5.06,1.021916,6.551132,0.0356541,-5.373832,-0.9401676,-5.787336,1.0,-5.778898,-5.365912,196607000.0,235738100.0,236183900.0,2017.0,2016.0,-1.883493,0.2713983,0.2724096
50%,1.0,167245200.0,20.14,1.142766,25.50274,0.1572043,0.0,0.0,-0.04504465,1.0,-0.04496231,0.0,783822900.0,1221833000.0,1222880000.0,2019.0,2018.0,1.115131,0.8354287,0.8361603
75%,1.0,779146600.0,107.95,1.533917,160.1946,1.0,5.686126,0.6394226,5.870456,1.0,5.861166,5.676968,2750985000.0,5306462000.0,5309057000.0,2022.0,2021.0,3.122349,1.503432,1.503695
max,1112.137,13330470000000.0,5808000.0,291143200.0,28240890000.0,3.555697,59999900.0,24674.8,59999900.0,119.0,59999900.0,59999900.0,3448906000000.0,5942845000000.0,5942845000000.0,2024.0,2023.0,12.22895,646787.6,646787.6


: 

## Fundamentals Data

### NA & Global - Loading and Preliminary Inspection

**NA**

In [114]:
na_fundamentals_df = pd.read_csv(
    "na_fundamentals_2014to2024.csv", 
    parse_dates=["datadate"]
).rename(columns={"fyear": "fiscalyear"})

print(na_fundamentals_df.dtypes)
print(na_fundamentals_df.isnull().sum())
na_fundamentals_df

gvkey                  int64
datadate      datetime64[ns]
fiscalyear           float64
indfmt                object
consol                object
popsrc                object
datafmt               object
conm                  object
curcd                 object
at                   float64
ceq                  float64
oiadp                float64
revt                 float64
costat                object
dtype: object
gvkey            0
datadate         0
fiscalyear     102
indfmt           0
consol           0
popsrc           0
datafmt          0
conm             0
curcd          102
at             906
ceq           1018
oiadp         8322
revt           968
costat           0
dtype: int64


Unnamed: 0,gvkey,datadate,fiscalyear,indfmt,consol,popsrc,datafmt,conm,curcd,at,ceq,oiadp,revt,costat
0,1004,2015-05-31,2014.0,INDL,C,D,STD,AAR CORP,USD,1515.000,845.100,-8.600,1594.300,A
1,1004,2016-05-31,2015.0,INDL,C,D,STD,AAR CORP,USD,1442.100,865.800,66.100,1662.600,A
2,1004,2017-05-31,2016.0,INDL,C,D,STD,AAR CORP,USD,1504.100,914.200,77.200,1767.600,A
3,1004,2018-05-31,2017.0,INDL,C,D,STD,AAR CORP,USD,1524.700,936.300,86.000,1748.300,A
4,1004,2019-05-31,2018.0,INDL,C,D,STD,AAR CORP,USD,1517.200,905.900,110.700,2051.800,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55142,353444,2021-12-31,2021.0,INDL,C,D,STD,HALEON PLC,USD,46650.099,35687.306,2816.529,12924.885,A
55143,353444,2022-12-31,2022.0,INDL,C,D,STD,HALEON PLC,USD,41948.594,19677.222,2913.448,13082.804,A
55144,353444,2023-12-31,2023.0,INDL,C,D,STD,HALEON PLC,USD,43379.259,21152.723,3181.953,14396.488,A
55145,356128,2022-12-31,2022.0,INDL,C,D,STD,JOINT STOCK COMPANY KASPI KZ,USD,11072.847,1771.010,2160.348,2746.982,A


In [115]:
na_fiscalyear_null_mask = pd.isnull(na_fundamentals_df["fiscalyear"])

# fiscal year is the previous year to the reporting period end date if the 
# month is May or earlier, otherwise the fiscal year is taken as the same 
# year as the reporting period end date 
na_fundamentals_df.loc[
    na_fiscalyear_null_mask, "fiscalyear"
] = na_fundamentals_df.loc[
    na_fiscalyear_null_mask, "datadate"
].dt.year - (
    na_fundamentals_df.loc[
        na_fiscalyear_null_mask, "datadate"
    ].dt.month < 6
)
# now that there are no nulls, ensure that it is integer typed for 
# compatibility with the other datasets
na_fundamentals_df["fiscalyear"] = na_fundamentals_df["fiscalyear"].astype(int)

# review
print(na_fundamentals_df.dtypes)
print(na_fundamentals_df.isnull().sum())
na_fundamentals_df

gvkey                  int64
datadate      datetime64[ns]
fiscalyear             int64
indfmt                object
consol                object
popsrc                object
datafmt               object
conm                  object
curcd                 object
at                   float64
ceq                  float64
oiadp                float64
revt                 float64
costat                object
dtype: object
gvkey            0
datadate         0
fiscalyear       0
indfmt           0
consol           0
popsrc           0
datafmt          0
conm             0
curcd          102
at             906
ceq           1018
oiadp         8322
revt           968
costat           0
dtype: int64


Unnamed: 0,gvkey,datadate,fiscalyear,indfmt,consol,popsrc,datafmt,conm,curcd,at,ceq,oiadp,revt,costat
0,1004,2015-05-31,2014,INDL,C,D,STD,AAR CORP,USD,1515.000,845.100,-8.600,1594.300,A
1,1004,2016-05-31,2015,INDL,C,D,STD,AAR CORP,USD,1442.100,865.800,66.100,1662.600,A
2,1004,2017-05-31,2016,INDL,C,D,STD,AAR CORP,USD,1504.100,914.200,77.200,1767.600,A
3,1004,2018-05-31,2017,INDL,C,D,STD,AAR CORP,USD,1524.700,936.300,86.000,1748.300,A
4,1004,2019-05-31,2018,INDL,C,D,STD,AAR CORP,USD,1517.200,905.900,110.700,2051.800,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55142,353444,2021-12-31,2021,INDL,C,D,STD,HALEON PLC,USD,46650.099,35687.306,2816.529,12924.885,A
55143,353444,2022-12-31,2022,INDL,C,D,STD,HALEON PLC,USD,41948.594,19677.222,2913.448,13082.804,A
55144,353444,2023-12-31,2023,INDL,C,D,STD,HALEON PLC,USD,43379.259,21152.723,3181.953,14396.488,A
55145,356128,2022-12-31,2022,INDL,C,D,STD,JOINT STOCK COMPANY KASPI KZ,USD,11072.847,1771.010,2160.348,2746.982,A


Companies, identified by gvkeys, can have multiple entries per fiscal year in the 
raw downloaded data. 

This is e.g. because of different 
reporting formats (`indfmt`).

 - Consolidate information into single gvkey-fiscalyear entries.

This will reduce the number of null values.

In [116]:
na_fundamentals_df = na_fundamentals_df.groupby(
    ["gvkey", "fiscalyear"]
).first()
na_fundamentals_df = na_fundamentals_df.sort_index()

# review
print(na_fundamentals_df.dtypes)
print(na_fundamentals_df.isnull().sum())
na_fundamentals_df

datadate    datetime64[ns]
indfmt              object
consol              object
popsrc              object
datafmt             object
conm                object
curcd               object
at                 float64
ceq                float64
oiadp              float64
revt               float64
costat              object
dtype: object
datadate       0
indfmt         0
consol         0
popsrc         0
datafmt        0
conm           0
curcd         99
at           902
ceq         1001
oiadp        961
revt         961
costat         0
dtype: int64


Unnamed: 0_level_0,Unnamed: 1_level_0,datadate,indfmt,consol,popsrc,datafmt,conm,curcd,at,ceq,oiadp,revt,costat
gvkey,fiscalyear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1004,2014,2015-05-31,INDL,C,D,STD,AAR CORP,USD,1515.000,845.100,-8.600,1594.300,A
1004,2015,2016-05-31,INDL,C,D,STD,AAR CORP,USD,1442.100,865.800,66.100,1662.600,A
1004,2016,2017-05-31,INDL,C,D,STD,AAR CORP,USD,1504.100,914.200,77.200,1767.600,A
1004,2017,2018-05-31,INDL,C,D,STD,AAR CORP,USD,1524.700,936.300,86.000,1748.300,A
1004,2018,2019-05-31,INDL,C,D,STD,AAR CORP,USD,1517.200,905.900,110.700,2051.800,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...
353444,2021,2021-12-31,INDL,C,D,STD,HALEON PLC,USD,46650.099,35687.306,2816.529,12924.885,A
353444,2022,2022-12-31,INDL,C,D,STD,HALEON PLC,USD,41948.594,19677.222,2913.448,13082.804,A
353444,2023,2023-12-31,INDL,C,D,STD,HALEON PLC,USD,43379.259,21152.723,3181.953,14396.488,A
356128,2022,2022-12-31,INDL,C,D,STD,JOINT STOCK COMPANY KASPI KZ,USD,11072.847,1771.010,2160.348,2746.982,A


Finally, drop the rows which still have any null values.

In [117]:
na_fundamentals_df = na_fundamentals_df.dropna()

na_fundamentals_df

Unnamed: 0_level_0,Unnamed: 1_level_0,datadate,indfmt,consol,popsrc,datafmt,conm,curcd,at,ceq,oiadp,revt,costat
gvkey,fiscalyear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1004,2014,2015-05-31,INDL,C,D,STD,AAR CORP,USD,1515.000,845.100,-8.600,1594.300,A
1004,2015,2016-05-31,INDL,C,D,STD,AAR CORP,USD,1442.100,865.800,66.100,1662.600,A
1004,2016,2017-05-31,INDL,C,D,STD,AAR CORP,USD,1504.100,914.200,77.200,1767.600,A
1004,2017,2018-05-31,INDL,C,D,STD,AAR CORP,USD,1524.700,936.300,86.000,1748.300,A
1004,2018,2019-05-31,INDL,C,D,STD,AAR CORP,USD,1517.200,905.900,110.700,2051.800,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...
353444,2021,2021-12-31,INDL,C,D,STD,HALEON PLC,USD,46650.099,35687.306,2816.529,12924.885,A
353444,2022,2022-12-31,INDL,C,D,STD,HALEON PLC,USD,41948.594,19677.222,2913.448,13082.804,A
353444,2023,2023-12-31,INDL,C,D,STD,HALEON PLC,USD,43379.259,21152.723,3181.953,14396.488,A
356128,2022,2022-12-31,INDL,C,D,STD,JOINT STOCK COMPANY KASPI KZ,USD,11072.847,1771.010,2160.348,2746.982,A


**Global**

In [123]:
global_fundamentals_df = pd.read_csv(
    "global_fundamentals_2014to2024.csv", 
    parse_dates=["datadate"]
).rename(columns={"fyear": "fiscalyear"})

global_fundamentals_df

Unnamed: 0,gvkey,curcd,fiscalyear,datadate,at,ceq,oiadp,revt,conm
0,1166,EUR,2014.0,2014-12-31,1826.933,1690.200,68.098,545.604,ASM INTERNATIONAL NV
1,1166,EUR,2015.0,2015-12-31,2075.977,1948.379,101.776,669.621,ASM INTERNATIONAL NV
2,1166,EUR,2016.0,2016-12-31,2148.263,2015.856,63.470,597.930,ASM INTERNATIONAL NV
3,1166,EUR,2017.0,2017-12-31,2177.202,2011.512,95.495,737.401,ASM INTERNATIONAL NV
4,1166,EUR,2018.0,2018-12-31,1847.972,1641.551,113.390,818.081,ASM INTERNATIONAL NV
...,...,...,...,...,...,...,...,...,...
172550,362282,INR,2018.0,2019-03-31,41.185,-32.446,20.168,81.469,DIENSTEN TECH LIMITED
172551,362282,INR,2019.0,2020-03-31,33.762,-22.880,17.603,56.199,DIENSTEN TECH LIMITED
172552,362282,INR,2020.0,2021-03-31,37.035,-11.173,20.016,52.320,DIENSTEN TECH LIMITED
172553,362282,INR,2021.0,2022-03-31,20.411,13.999,1.870,7.241,DIENSTEN TECH LIMITED


Same processing as for NA.

In [124]:
global_fiscalyear_null_mask = pd.isnull(global_fundamentals_df["fiscalyear"])

# fiscal year is the previous year to the reporting period end date if the 
# month is May or earlier, otherwise the fiscal year is taken as the same 
# year as the reporting period end date 
global_fundamentals_df.loc[
    global_fiscalyear_null_mask, "fiscalyear"
] = global_fundamentals_df.loc[
    global_fiscalyear_null_mask, "datadate"
].dt.year - (
    global_fundamentals_df.loc[
        global_fiscalyear_null_mask, "datadate"
    ].dt.month < 6
)
# now that there are no nulls, ensure that it is integer typed for 
# compatibility with the other datasets
global_fundamentals_df["fiscalyear"] = global_fundamentals_df["fiscalyear"].astype(int)

global_fundamentals_df = global_fundamentals_df.groupby(
    ["gvkey", "fiscalyear"]
).first()
global_fundamentals_df = global_fundamentals_df.sort_index()
global_fundamentals_df = global_fundamentals_df.dropna()

global_fundamentals_df

Unnamed: 0_level_0,Unnamed: 1_level_0,curcd,datadate,at,ceq,oiadp,revt,conm
gvkey,fiscalyear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1166,2014,EUR,2014-12-31,1826.933,1690.200,68.098,545.604,ASM INTERNATIONAL NV
1166,2015,EUR,2015-12-31,2075.977,1948.379,101.776,669.621,ASM INTERNATIONAL NV
1166,2016,EUR,2016-12-31,2148.263,2015.856,63.470,597.930,ASM INTERNATIONAL NV
1166,2017,EUR,2017-12-31,2177.202,2011.512,95.495,737.401,ASM INTERNATIONAL NV
1166,2018,EUR,2018-12-31,1847.972,1641.551,113.390,818.081,ASM INTERNATIONAL NV
...,...,...,...,...,...,...,...,...
362282,2018,INR,2019-03-31,41.185,-32.446,20.168,81.469,DIENSTEN TECH LIMITED
362282,2019,INR,2020-03-31,33.762,-22.880,17.603,56.199,DIENSTEN TECH LIMITED
362282,2020,INR,2021-03-31,37.035,-11.173,20.016,52.320,DIENSTEN TECH LIMITED
362282,2021,INR,2022-03-31,20.411,13.999,1.870,7.241,DIENSTEN TECH LIMITED


### NA & Global - Data Preparation

Combine both datasets into one authorative fundamentals data source.

In [125]:
fundamentals_df = pd.concat(
    [na_fundamentals_df.reset_index(), global_fundamentals_df.reset_index()], 
    ignore_index=True, 
).groupby(
    ["gvkey", "fiscalyear"]
).first().sort_index()

fundamentals_df

Unnamed: 0_level_0,Unnamed: 1_level_0,datadate,indfmt,consol,popsrc,datafmt,conm,curcd,at,ceq,oiadp,revt,costat
gvkey,fiscalyear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1004,2014,2015-05-31,INDL,C,D,STD,AAR CORP,USD,1515.000,845.100,-8.600,1594.300,A
1004,2015,2016-05-31,INDL,C,D,STD,AAR CORP,USD,1442.100,865.800,66.100,1662.600,A
1004,2016,2017-05-31,INDL,C,D,STD,AAR CORP,USD,1504.100,914.200,77.200,1767.600,A
1004,2017,2018-05-31,INDL,C,D,STD,AAR CORP,USD,1524.700,936.300,86.000,1748.300,A
1004,2018,2019-05-31,INDL,C,D,STD,AAR CORP,USD,1517.200,905.900,110.700,2051.800,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...
362282,2018,2019-03-31,,,,,DIENSTEN TECH LIMITED,INR,41.185,-32.446,20.168,81.469,
362282,2019,2020-03-31,,,,,DIENSTEN TECH LIMITED,INR,33.762,-22.880,17.603,56.199,
362282,2020,2021-03-31,,,,,DIENSTEN TECH LIMITED,INR,37.035,-11.173,20.016,52.320,
362282,2021,2022-03-31,,,,,DIENSTEN TECH LIMITED,INR,20.411,13.999,1.870,7.241,


Join with the exchange rates to USD in order to standardise the values to USD.
- Last for balance sheet
- Average for income statement

In [1575]:
m_exrts_flat = m_exrts.reset_index()
m_exrts_flat["datayear"] = m_exrts_flat["data_ym"].dt.year

m_exrts_flat

Unnamed: 0,curd,data_ym,exratd_toGBP,exratd_toUSD,datayear
0,AED,2021-01,0.198491,0.272251,2021
1,AED,2021-02,0.195503,0.272258,2021
2,AED,2021-03,0.197625,0.272248,2021
3,AED,2021-04,0.197083,0.272251,2021
4,AED,2021-05,0.191637,0.272259,2021
...,...,...,...,...,...
6752,ZWL,2024-04,0.002350,0.002944,2024
6753,ZWL,2024-05,0.002350,0.002991,2024
6754,ZWL,2024-06,0.002350,0.002970,2024
6755,ZWL,2024-07,0.002350,0.003018,2024


In [1576]:
y_bs_exrt = m_exrts_flat.drop(columns="data_ym").groupby(
    ["curd", "datayear"]
).last()[["exratd_toUSD"]].rename(
    columns={"exratd_toUSD": "bs_toUSD"}
)

y_bs_exrt

Unnamed: 0_level_0,Unnamed: 1_level_0,bs_toUSD
curd,datayear,Unnamed: 2_level_1
AED,2021,0.272263
AED,2022,0.272257
AED,2023,0.272249
AED,2024,0.272248
AFN,2021,0.009641
...,...,...
ZMW,2024,0.038023
ZWL,2021,0.003183
ZWL,2022,0.002832
ZWL,2023,0.002994


In [1577]:
y_is_exrt = m_exrts_flat.drop(columns="data_ym").groupby(
    ["curd", "datayear"]
).mean()[["exratd_toUSD"]].rename(columns={"exratd_toUSD": "is_toUSD"})

y_is_exrt

Unnamed: 0_level_0,Unnamed: 1_level_0,is_toUSD
curd,datayear,Unnamed: 2_level_1
AED,2021,0.272250
AED,2022,0.272247
AED,2023,0.272256
AED,2024,0.272254
AFN,2021,0.011940
...,...,...
ZMW,2024,0.039096
ZWL,2021,0.003230
ZWL,2022,0.002893
ZWL,2023,0.002933


In [1578]:
y_fs_exrt = pd.concat(
    [y_bs_exrt, y_is_exrt], 
    axis=1
)

y_fs_exrt

Unnamed: 0_level_0,Unnamed: 1_level_0,bs_toUSD,is_toUSD
curd,datayear,Unnamed: 2_level_1,Unnamed: 3_level_1
AED,2021,0.272263,0.272250
AED,2022,0.272257,0.272247
AED,2023,0.272249,0.272256
AED,2024,0.272248,0.272254
AFN,2021,0.009641,0.011940
...,...,...,...
ZMW,2024,0.038023,0.039096
ZWL,2021,0.003183,0.003230
ZWL,2022,0.002832,0.002893
ZWL,2023,0.002994,0.002933


In [1579]:
fundamentals_df = fundamentals_df.reset_index().merge(
    y_fs_exrt, 
    how="left", 
    left_on=["curcd", "fyear"], 
    right_index=True, 
).set_index(
    ["gvkey", "fyear"]
)

fundamentals_df

Unnamed: 0_level_0,Unnamed: 1_level_0,datadate,indfmt,consol,popsrc,datafmt,conm,curcd,at,ceq,oiadp,revt,costat,bs_toUSD,is_toUSD
gvkey,fyear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
100080,2021,2021-12-31,INDL,C,D,STD,BAYER AG,USD,136753.182,37554.496,8084.110,50134.455,A,1.0,1.0
100080,2022,2022-12-31,INDL,C,D,STD,BAYER AG,USD,133461.325,41438.343,9863.422,54226.913,A,1.0,1.0
100080,2023,2023-12-31,INDL,C,D,STD,BAYER AG,USD,128506.347,36395.707,7641.253,52655.337,A,1.0,1.0
100091,2021,2021-12-31,INDL,C,D,STD,RENTOKIL INITIAL PLC,USD,5854.000,1712.000,497.000,4004.000,A,1.0,1.0
100091,2022,2022-12-31,INDL,C,D,STD,RENTOKIL INITIAL PLC,USD,14365.000,4939.000,605.000,4475.000,A,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351590,2022,2022-12-31,INDL,C,D,STD,DAIMLER TRUCK HOLDING AG,USD,68366.372,21430.419,3730.979,54322.030,A,1.0,1.0
351590,2023,2023-12-31,INDL,C,D,STD,DAIMLER TRUCK HOLDING AG,USD,78713.854,23880.987,5579.784,61777.753,A,1.0,1.0
353444,2021,2021-12-31,INDL,C,D,STD,HALEON PLC,USD,46650.099,35687.306,2816.529,12924.885,A,1.0,1.0
353444,2022,2022-12-31,INDL,C,D,STD,HALEON PLC,USD,41948.594,19677.222,2913.448,13082.804,A,1.0,1.0


Apply the exchange rates.

In [1580]:
fundamentals_df["at"] *= fundamentals_df["bs_toUSD"]
fundamentals_df["ceq"] *= fundamentals_df["bs_toUSD"]

fundamentals_df["oiadp"] *= fundamentals_df["is_toUSD"]
fundamentals_df["revt"] *= fundamentals_df["is_toUSD"]

Compute investment by percentage change in assets.

In [1581]:
fundamentals_df["investment"] = fundamentals_df.groupby(
    level="gvkey"
)["at"].pct_change()

fundamentals_df

Unnamed: 0_level_0,Unnamed: 1_level_0,datadate,indfmt,consol,popsrc,datafmt,conm,curcd,at,ceq,oiadp,revt,costat,bs_toUSD,is_toUSD,investment
gvkey,fyear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
100080,2021,2021-12-31,INDL,C,D,STD,BAYER AG,USD,136753.182,37554.496,8084.110,50134.455,A,1.0,1.0,
100080,2022,2022-12-31,INDL,C,D,STD,BAYER AG,USD,133461.325,41438.343,9863.422,54226.913,A,1.0,1.0,-0.024072
100080,2023,2023-12-31,INDL,C,D,STD,BAYER AG,USD,128506.347,36395.707,7641.253,52655.337,A,1.0,1.0,-0.037127
100091,2021,2021-12-31,INDL,C,D,STD,RENTOKIL INITIAL PLC,USD,5854.000,1712.000,497.000,4004.000,A,1.0,1.0,
100091,2022,2022-12-31,INDL,C,D,STD,RENTOKIL INITIAL PLC,USD,14365.000,4939.000,605.000,4475.000,A,1.0,1.0,1.453878
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351590,2022,2022-12-31,INDL,C,D,STD,DAIMLER TRUCK HOLDING AG,USD,68366.372,21430.419,3730.979,54322.030,A,1.0,1.0,0.096925
351590,2023,2023-12-31,INDL,C,D,STD,DAIMLER TRUCK HOLDING AG,USD,78713.854,23880.987,5579.784,61777.753,A,1.0,1.0,0.151353
353444,2021,2021-12-31,INDL,C,D,STD,HALEON PLC,USD,46650.099,35687.306,2816.529,12924.885,A,1.0,1.0,
353444,2022,2022-12-31,INDL,C,D,STD,HALEON PLC,USD,41948.594,19677.222,2913.448,13082.804,A,1.0,1.0,-0.100782


In [1582]:
len(fundamentals_df.dropna())

2298

Compute operating profitability.

In [1583]:
fundamentals_df["opm"] = fundamentals_df["oiadp"] / fundamentals_df["revt"]
# np.clip(np.nan_to_num(
#     , 
#     posinf=0, 
#     neginf=0
# ), a_min=0, a_max=None)

fundamentals_df

Unnamed: 0_level_0,Unnamed: 1_level_0,datadate,indfmt,consol,popsrc,datafmt,conm,curcd,at,ceq,oiadp,revt,costat,bs_toUSD,is_toUSD,investment,opm
gvkey,fyear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
100080,2021,2021-12-31,INDL,C,D,STD,BAYER AG,USD,136753.182,37554.496,8084.110,50134.455,A,1.0,1.0,,0.161249
100080,2022,2022-12-31,INDL,C,D,STD,BAYER AG,USD,133461.325,41438.343,9863.422,54226.913,A,1.0,1.0,-0.024072,0.181892
100080,2023,2023-12-31,INDL,C,D,STD,BAYER AG,USD,128506.347,36395.707,7641.253,52655.337,A,1.0,1.0,-0.037127,0.145118
100091,2021,2021-12-31,INDL,C,D,STD,RENTOKIL INITIAL PLC,USD,5854.000,1712.000,497.000,4004.000,A,1.0,1.0,,0.124126
100091,2022,2022-12-31,INDL,C,D,STD,RENTOKIL INITIAL PLC,USD,14365.000,4939.000,605.000,4475.000,A,1.0,1.0,1.453878,0.135196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351590,2022,2022-12-31,INDL,C,D,STD,DAIMLER TRUCK HOLDING AG,USD,68366.372,21430.419,3730.979,54322.030,A,1.0,1.0,0.096925,0.068683
351590,2023,2023-12-31,INDL,C,D,STD,DAIMLER TRUCK HOLDING AG,USD,78713.854,23880.987,5579.784,61777.753,A,1.0,1.0,0.151353,0.090320
353444,2021,2021-12-31,INDL,C,D,STD,HALEON PLC,USD,46650.099,35687.306,2816.529,12924.885,A,1.0,1.0,,0.217915
353444,2022,2022-12-31,INDL,C,D,STD,HALEON PLC,USD,41948.594,19677.222,2913.448,13082.804,A,1.0,1.0,-0.100782,0.222693


Finally, drop null rows so that the table is ready for linking.

In [1584]:
with pd.option_context('mode.use_inf_as_null', True):
    fundamentals_df = fundamentals_df.dropna()

fundamentals_df

Unnamed: 0_level_0,Unnamed: 1_level_0,datadate,indfmt,consol,popsrc,datafmt,conm,curcd,at,ceq,oiadp,revt,costat,bs_toUSD,is_toUSD,investment,opm
gvkey,fyear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
100080,2022,2022-12-31,INDL,C,D,STD,BAYER AG,USD,133461.325,41438.343,9863.422,54226.913,A,1.0,1.0,-0.024072,0.181892
100080,2023,2023-12-31,INDL,C,D,STD,BAYER AG,USD,128506.347,36395.707,7641.253,52655.337,A,1.0,1.0,-0.037127,0.145118
100091,2022,2022-12-31,INDL,C,D,STD,RENTOKIL INITIAL PLC,USD,14365.000,4939.000,605.000,4475.000,A,1.0,1.0,1.453878,0.135196
100091,2023,2023-12-31,INDL,C,D,STD,RENTOKIL INITIAL PLC,USD,14174.000,5209.000,971.000,6847.000,A,1.0,1.0,-0.013296,0.141814
100095,2022,2022-12-31,INDL,C,D,STD,BUNZL PLC,USD,10442.627,3278.412,920.785,14506.394,A,1.0,1.0,0.084024,0.063474
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351491,2023,2023-12-31,INDL,C,D,STD,IVECO GROUP N V,USD,20321.775,2600.878,1158.402,17920.964,A,1.0,1.0,0.187452,0.064639
351590,2022,2022-12-31,INDL,C,D,STD,DAIMLER TRUCK HOLDING AG,USD,68366.372,21430.419,3730.979,54322.030,A,1.0,1.0,0.096925,0.068683
351590,2023,2023-12-31,INDL,C,D,STD,DAIMLER TRUCK HOLDING AG,USD,78713.854,23880.987,5579.784,61777.753,A,1.0,1.0,0.151353,0.090320
353444,2022,2022-12-31,INDL,C,D,STD,HALEON PLC,USD,41948.594,19677.222,2913.448,13082.804,A,1.0,1.0,-0.100782,0.222693


In [1645]:
fundamentals_df

Unnamed: 0_level_0,Unnamed: 1_level_0,ceq,opm,investment
gvkey,fiscalyear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100080,2022,41438.343,0.181892,-0.024072
100080,2023,36395.707,0.145118,-0.037127
100091,2022,4939.000,0.135196,1.453878
100091,2023,5209.000,0.141814,-0.013296
100095,2022,3278.412,0.063474,0.084024
...,...,...,...,...
351491,2023,2600.878,0.064639,0.187452
351590,2022,21430.419,0.068683,0.096925
351590,2023,23880.987,0.090320,0.151353
353444,2022,19677.222,0.222693,-0.100782


In [None]:
fundamentals_df = fundamentals_df[
    ["ceq", "opm", "investment"]
]
fundamentals_df.index = fundamentals_df.index.rename(["gvkey", "fiscalyear"])

In [None]:
fundamentals_df

snippets

In [1530]:
(fundamentals_df["opm"] == 0).sum()

0

## Linking

First the two lagged, annual frequency datasets for emissions and fundamentals.

In [1616]:
emissions_df

Unnamed: 0_level_0,Unnamed: 1_level_0,di_319413,di_319414,di_319415,country
gvkey,fiscalyear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
122594,2022,18853.165084,72589.700681,1.323541e+06,United States
122594,2023,23250.617734,89521.063147,1.632254e+06,United States
122554,2022,16736.521444,51330.155513,7.097935e+05,United States
122554,2023,19585.900250,60069.071644,8.306352e+05,United States
63734,2022,1027.993067,3152.808316,4.359704e+04,United States
...,...,...,...,...,...
361808,2022,489.910950,1022.660743,6.379460e+03,Malaysia
358709,2022,291.908186,183.331947,5.197957e+02,Italy
358653,2022,12.539324,0.790235,6.400957e+00,Japan
359029,2022,2.811856,0.177205,1.435370e+00,Japan


In [None]:
fundamentals_df

In [1646]:
na_returns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,ajexm,curcdm,prccm,trfm,cshom,adjclose,exratd_toGBP,exratd_toUSD,...,m_USD_ret,m_GBP_ret,m_local_ret,local_mktval,USD_mktval,GBP_mktval,datayear,datayear-1,m_mktret,beta
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
100001,91,2023-01,2023-01-31,ACCOR SA,1.0,USD,6.466,2.1735,2.630320e+08,14.053851,0.811820,1.0,...,29.735152,26.902001,29.735152,1.310951e+09,1.310951e+09,1.088017e+09,2023,2022,7.073509,0.752327
100001,91,2023-02,2023-02-28,ACCOR SA,1.0,USD,6.677,2.1735,2.630320e+08,14.512460,0.825355,1.0,...,3.263223,4.984845,3.263223,1.700765e+09,1.700765e+09,1.380715e+09,2023,2022,-2.994120,1.278480
100001,91,2023-03,2023-03-31,ACCOR SA,1.0,USD,6.512,2.1735,2.630320e+08,14.153832,0.808342,1.0,...,-2.471170,-4.481504,-2.471170,1.756265e+09,1.756265e+09,1.449542e+09,2023,2022,3.086278,1.235725
100001,91,2023-04,2023-04-30,ACCOR SA,1.0,USD,7.092,2.1735,2.630320e+08,15.414462,0.795418,1.0,...,8.906634,7.165444,8.906634,1.712864e+09,1.712864e+09,1.384580e+09,2023,2022,1.508235,1.236422
100001,91,2023-05,2023-05-31,ACCOR SA,1.0,USD,6.603,2.2475,2.648900e+08,14.840243,0.807298,1.0,...,-3.725200,-2.287334,-3.725200,1.865423e+09,1.865423e+09,1.483792e+09,2023,2022,-1.175735,1.464333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353444,90,2024-03,2024-03-31,HALEON PLC,1.0,USD,8.490,1.0253,9.132301e+09,8.704797,0.791766,1.0,...,0.416267,0.440119,0.416267,7.904795e+10,7.904795e+10,6.257259e+10,2024,2023,3.153459,0.736227
353444,90,2024-04,2024-04-30,HALEON PLC,1.0,USD,8.530,1.0253,9.132301e+09,8.745809,0.798403,1.0,...,0.471143,1.313416,0.471143,7.753324e+10,7.753324e+10,6.138815e+10,2024,2023,-3.654085,0.707596
353444,90,2024-05,2024-05-31,HALEON PLC,1.0,USD,8.440,1.0253,9.132301e+09,8.653532,0.785916,1.0,...,-1.055100,-2.602572,-1.055100,7.789853e+10,7.789853e+10,6.219443e+10,2024,2023,4.284219,0.609430
353444,90,2024-06,2024-06-30,HALEON PLC,1.0,USD,8.260,1.0253,9.132301e+09,8.468978,0.791264,1.0,...,-2.132701,-1.466727,-2.132701,7.707662e+10,7.707662e+10,6.057578e+10,2024,2023,2.087410,0.461021


In [1647]:
reg_df = na_returns.join(
    emissions_df.join(fundamentals_df).dropna(), 
    on=["gvkey", "datayear-1"]
).dropna()

reg_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datadate,conm,ajexm,curcdm,prccm,trfm,cshom,adjclose,exratd_toGBP,exratd_toUSD,...,di_319413,di_319414,di_319415,companyid,companyname,country,periodend_ym,ceq,opm,investment
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
100080,90,2023-01,2023-01-31,BAYER AG,1.0,USD,15.476,2.2879,9.824240e+08,35.407540,0.811820,1.0,...,1903000.0,1560000.0,6666060.654,103375,Bayer Aktiengesellschaft,Germany,2022-12,41438.343,0.181892,-0.024072
100080,90,2023-02,2023-02-28,BAYER AG,1.0,USD,14.924,2.2879,9.824240e+08,34.144620,0.825355,1.0,...,1903000.0,1560000.0,6666060.654,103375,Bayer Aktiengesellschaft,Germany,2022-12,41438.343,0.181892,-0.024072
100080,90,2023-03,2023-03-31,BAYER AG,1.0,USD,15.949,2.2879,9.824240e+08,36.489717,0.808342,1.0,...,1903000.0,1560000.0,6666060.654,103375,Bayer Aktiengesellschaft,Germany,2022-12,41438.343,0.181892,-0.024072
100080,90,2023-04,2023-04-30,BAYER AG,1.0,USD,16.494,2.2879,9.824240e+08,37.736623,0.795418,1.0,...,1903000.0,1560000.0,6666060.654,103375,Bayer Aktiengesellschaft,Germany,2022-12,41438.343,0.181892,-0.024072
100080,90,2023-05,2023-05-31,BAYER AG,1.0,USD,13.897,2.3966,9.824240e+08,33.305550,0.807298,1.0,...,1903000.0,1560000.0,6666060.654,103375,Bayer Aktiengesellschaft,Germany,2022-12,41438.343,0.181892,-0.024072
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353444,90,2024-03,2024-03-31,HALEON PLC,1.0,USD,8.490,1.0253,9.132301e+09,8.704797,0.791766,1.0,...,60000.0,142000.0,1183620.035,1671690919,Haleon plc,United Kingdom,2023-12,21152.723,0.221023,0.034105
353444,90,2024-04,2024-04-30,HALEON PLC,1.0,USD,8.530,1.0253,9.132301e+09,8.745809,0.798403,1.0,...,60000.0,142000.0,1183620.035,1671690919,Haleon plc,United Kingdom,2023-12,21152.723,0.221023,0.034105
353444,90,2024-05,2024-05-31,HALEON PLC,1.0,USD,8.440,1.0253,9.132301e+09,8.653532,0.785916,1.0,...,60000.0,142000.0,1183620.035,1671690919,Haleon plc,United Kingdom,2023-12,21152.723,0.221023,0.034105
353444,90,2024-06,2024-06-30,HALEON PLC,1.0,USD,8.260,1.0253,9.132301e+09,8.468978,0.791264,1.0,...,60000.0,142000.0,1183620.035,1671690919,Haleon plc,United Kingdom,2023-12,21152.723,0.221023,0.034105


In [1648]:
fm_reg_df = reg_df[
    ["beta", "USD_mktval", "ceq", "opm", "investment", "di_319413", "di_319414", "di_319415", "m_USD_ret"]
]

fm_reg_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,beta,USD_mktval,ceq,opm,investment,di_319413,di_319414,di_319415,m_USD_ret
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
100080,90,2023-01,1.203150,1.266738e+10,41438.343,0.181892,-0.024072,1903000.0,1560000.0,6666060.654,20.024818
100080,90,2023-02,1.481416,1.520399e+10,41438.343,0.181892,-0.024072,1903000.0,1560000.0,6666060.654,-3.566813
100080,90,2023-03,1.477962,1.466170e+10,41438.343,0.181892,-0.024072,1903000.0,1560000.0,6666060.654,6.868132
100080,90,2023-04,1.399726,1.566868e+10,41438.343,0.181892,-0.024072,1903000.0,1560000.0,6666060.654,3.417142
100080,90,2023-05,1.525930,1.620410e+10,41438.343,0.181892,-0.024072,1903000.0,1560000.0,6666060.654,-11.742101
...,...,...,...,...,...,...,...,...,...,...,...
353444,90,2024-03,0.736227,7.904795e+10,21152.723,0.221023,0.034105,60000.0,142000.0,1183620.035,0.416267
353444,90,2024-04,0.707596,7.753324e+10,21152.723,0.221023,0.034105,60000.0,142000.0,1183620.035,0.471143
353444,90,2024-05,0.609430,7.789853e+10,21152.723,0.221023,0.034105,60000.0,142000.0,1183620.035,-1.055100
353444,90,2024-06,0.461021,7.707662e+10,21152.723,0.221023,0.034105,60000.0,142000.0,1183620.035,-2.132701


## Fama-Macbeth Regression

In [1649]:
fm_reg_df = fm_reg_df.copy()

fm_reg_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,beta,USD_mktval,ceq,opm,investment,di_319413,di_319414,di_319415,m_USD_ret
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
100080,90,2023-01,1.203150,1.266738e+10,41438.343,0.181892,-0.024072,1903000.0,1560000.0,6666060.654,20.024818
100080,90,2023-02,1.481416,1.520399e+10,41438.343,0.181892,-0.024072,1903000.0,1560000.0,6666060.654,-3.566813
100080,90,2023-03,1.477962,1.466170e+10,41438.343,0.181892,-0.024072,1903000.0,1560000.0,6666060.654,6.868132
100080,90,2023-04,1.399726,1.566868e+10,41438.343,0.181892,-0.024072,1903000.0,1560000.0,6666060.654,3.417142
100080,90,2023-05,1.525930,1.620410e+10,41438.343,0.181892,-0.024072,1903000.0,1560000.0,6666060.654,-11.742101
...,...,...,...,...,...,...,...,...,...,...,...
353444,90,2024-03,0.736227,7.904795e+10,21152.723,0.221023,0.034105,60000.0,142000.0,1183620.035,0.416267
353444,90,2024-04,0.707596,7.753324e+10,21152.723,0.221023,0.034105,60000.0,142000.0,1183620.035,0.471143
353444,90,2024-05,0.609430,7.789853e+10,21152.723,0.221023,0.034105,60000.0,142000.0,1183620.035,-1.055100
353444,90,2024-06,0.461021,7.707662e+10,21152.723,0.221023,0.034105,60000.0,142000.0,1183620.035,-2.132701


In [1650]:
fm_reg_df["B/M"] = fm_reg_df["ceq"] / fm_reg_df["USD_mktval"]

fm_reg_df.dropna()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,beta,USD_mktval,ceq,opm,investment,di_319413,di_319414,di_319415,m_USD_ret,B/M
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
100080,90,2023-01,1.203150,1.266738e+10,41438.343,0.181892,-0.024072,1903000.0,1560000.0,6666060.654,20.024818,3.271265e-06
100080,90,2023-02,1.481416,1.520399e+10,41438.343,0.181892,-0.024072,1903000.0,1560000.0,6666060.654,-3.566813,2.725491e-06
100080,90,2023-03,1.477962,1.466170e+10,41438.343,0.181892,-0.024072,1903000.0,1560000.0,6666060.654,6.868132,2.826299e-06
100080,90,2023-04,1.399726,1.566868e+10,41438.343,0.181892,-0.024072,1903000.0,1560000.0,6666060.654,3.417142,2.644661e-06
100080,90,2023-05,1.525930,1.620410e+10,41438.343,0.181892,-0.024072,1903000.0,1560000.0,6666060.654,-11.742101,2.557275e-06
...,...,...,...,...,...,...,...,...,...,...,...,...
353444,90,2024-03,0.736227,7.904795e+10,21152.723,0.221023,0.034105,60000.0,142000.0,1183620.035,0.416267,2.675936e-07
353444,90,2024-04,0.707596,7.753324e+10,21152.723,0.221023,0.034105,60000.0,142000.0,1183620.035,0.471143,2.728214e-07
353444,90,2024-05,0.609430,7.789853e+10,21152.723,0.221023,0.034105,60000.0,142000.0,1183620.035,-1.055100,2.715420e-07
353444,90,2024-06,0.461021,7.707662e+10,21152.723,0.221023,0.034105,60000.0,142000.0,1183620.035,-2.132701,2.744376e-07


In [1623]:
fm_reg_df["ln(ME)"] = np.log(fm_reg_df["USD_mktval"])

fm_reg_df.dropna()

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,beta,USD_mktval,ceq,opm,investment,di_319413,di_319414,di_319415,m_USD_ret,B/M,ln(ME)
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
100080,90,2023-01,1.203150,1.266738e+10,41438.343,0.181892,-0.024072,1903000.0,1560000.0,6666060.654,20.024818,3.271265e-06,23.262296
100080,90,2023-02,1.481416,1.520399e+10,41438.343,0.181892,-0.024072,1903000.0,1560000.0,6666060.654,-3.566813,2.725491e-06,23.444824
100080,90,2023-03,1.477962,1.466170e+10,41438.343,0.181892,-0.024072,1903000.0,1560000.0,6666060.654,6.868132,2.826299e-06,23.408504
100080,90,2023-04,1.399726,1.566868e+10,41438.343,0.181892,-0.024072,1903000.0,1560000.0,6666060.654,3.417142,2.644661e-06,23.474930
100080,90,2023-05,1.525930,1.620410e+10,41438.343,0.181892,-0.024072,1903000.0,1560000.0,6666060.654,-11.742101,2.557275e-06,23.508530
...,...,...,...,...,...,...,...,...,...,...,...,...,...
353444,90,2024-03,0.736227,7.904795e+10,21152.723,0.221023,0.034105,60000.0,142000.0,1183620.035,0.416267,2.675936e-07,25.093321
353444,90,2024-04,0.707596,7.753324e+10,21152.723,0.221023,0.034105,60000.0,142000.0,1183620.035,0.471143,2.728214e-07,25.073973
353444,90,2024-05,0.609430,7.789853e+10,21152.723,0.221023,0.034105,60000.0,142000.0,1183620.035,-1.055100,2.715420e-07,25.078673
353444,90,2024-06,0.461021,7.707662e+10,21152.723,0.221023,0.034105,60000.0,142000.0,1183620.035,-2.132701,2.744376e-07,25.068066


In [1553]:
fm_reg_df["ln(GHGS1)"] = np.log(np.clip(fm_reg_df["di_319413"], 1e-12, None))
fm_reg_df["ln(GHGS2)"] = np.log(np.clip(fm_reg_df["di_319414"], 1e-12, None))
fm_reg_df["ln(GHGS3)"] = np.log(np.clip(fm_reg_df["di_319415"], 1e-12, None))

fm_reg_df.dropna()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,beta,USD_mktval,ceq,opm,investment,di_319413,di_319414,di_319415,m_USD_ret,B/M,ln(ME),ln(GHGS1),ln(GHGS2),ln(GHGS3)
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
100080,90,2023-01,1.203150,1.266738e+10,41438.343,0.181892,-0.024072,1903000.0,1560000.0,6666060.654,0.200248,3.271265e-06,23.262296,14.458942,14.260196,15.712540
100080,90,2023-02,1.481416,1.520399e+10,41438.343,0.181892,-0.024072,1903000.0,1560000.0,6666060.654,-0.035668,2.725491e-06,23.444824,14.458942,14.260196,15.712540
100080,90,2023-03,1.477962,1.466170e+10,41438.343,0.181892,-0.024072,1903000.0,1560000.0,6666060.654,0.068681,2.826299e-06,23.408504,14.458942,14.260196,15.712540
100080,90,2023-04,1.399726,1.566868e+10,41438.343,0.181892,-0.024072,1903000.0,1560000.0,6666060.654,0.034171,2.644661e-06,23.474930,14.458942,14.260196,15.712540
100080,90,2023-05,1.525930,1.620410e+10,41438.343,0.181892,-0.024072,1903000.0,1560000.0,6666060.654,-0.117421,2.557275e-06,23.508530,14.458942,14.260196,15.712540
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353444,90,2024-03,0.736227,7.904795e+10,21152.723,0.221023,0.034105,60000.0,142000.0,1183620.035,0.004163,2.675936e-07,25.093321,11.002100,11.863582,13.984088
353444,90,2024-04,0.707596,7.753324e+10,21152.723,0.221023,0.034105,60000.0,142000.0,1183620.035,0.004711,2.728214e-07,25.073973,11.002100,11.863582,13.984088
353444,90,2024-05,0.609430,7.789853e+10,21152.723,0.221023,0.034105,60000.0,142000.0,1183620.035,-0.010551,2.715420e-07,25.078673,11.002100,11.863582,13.984088
353444,90,2024-06,0.461021,7.707662e+10,21152.723,0.221023,0.034105,60000.0,142000.0,1183620.035,-0.021327,2.744376e-07,25.068066,11.002100,11.863582,13.984088


In [1438]:
fm_reg_df.describe()

Unnamed: 0,beta,USD_mktval,ceq,opm,investment,di_319413,di_319414,di_319415,m_USD_ret,B/M,ln(ME),ln(GHGS1),ln(GHGS2),ln(GHGS3)
count,18940.0,18940.0,18940.0,18940.0,18940.0,18940.0,18940.0,18940.0,18940.0,18940.0,18940.0,18940.0,18940.0,18940.0
mean,1.175945,21211100000.0,4782.019109,0.185577,0.067486,910120.9,226680.1,847339.3,0.01301,2.323608e-06,21.135403,8.878584,8.936661,10.683338
std,0.924012,122459100000.0,17097.950526,0.201492,0.450417,5639518.0,1152125.0,4058064.0,0.151134,8.655701e-05,2.248115,3.55843,2.933951,2.740274
min,-3.640606,2721.0,-8957.0,0.0,-0.949444,0.0,0.075,0.311,-0.75625,-0.0003219715,7.908755,-27.631021,-2.590267,-1.167962
25%,0.63972,284025100.0,178.759,0.018953,-0.062097,833.894,1150.778,7947.504,-0.064525,2.292364e-07,19.464573,6.726106,7.048194,8.980613
50%,1.082011,1399339000.0,620.855,0.122522,0.014261,7266.0,9328.0,44912.21,0.004536,5.363232e-07,21.059266,8.890961,9.140776,10.712465
75%,1.569766,6720846000.0,2773.438,0.281589,0.106345,55669.0,50867.6,270861.0,0.079676,1.046418e-06,22.62848,10.927179,10.836981,12.509361
max,7.799617,2696790000000.0,283379.0,1.118958,8.461496,119680000.0,40880000.0,65902930.0,1.314186,0.008966556,28.623083,18.600332,17.526152,18.003694


In [1554]:
fm_reg_df = fm_reg_df[
    ["beta", "ln(ME)", "B/M", "opm", "investment", "ln(GHGS1)", "ln(GHGS2)", "ln(GHGS3)", "m_USD_ret"]
]

fm_reg_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,beta,ln(ME),B/M,opm,investment,ln(GHGS1),ln(GHGS2),ln(GHGS3),m_USD_ret
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
100080,90,2023-01,1.203150,23.262296,3.271265e-06,0.181892,-0.024072,14.458942,14.260196,15.712540,0.200248
100080,90,2023-02,1.481416,23.444824,2.725491e-06,0.181892,-0.024072,14.458942,14.260196,15.712540,-0.035668
100080,90,2023-03,1.477962,23.408504,2.826299e-06,0.181892,-0.024072,14.458942,14.260196,15.712540,0.068681
100080,90,2023-04,1.399726,23.474930,2.644661e-06,0.181892,-0.024072,14.458942,14.260196,15.712540,0.034171
100080,90,2023-05,1.525930,23.508530,2.557275e-06,0.181892,-0.024072,14.458942,14.260196,15.712540,-0.117421
...,...,...,...,...,...,...,...,...,...,...,...
353444,90,2024-03,0.736227,25.093321,2.675936e-07,0.221023,0.034105,11.002100,11.863582,13.984088,0.004163
353444,90,2024-04,0.707596,25.073973,2.728214e-07,0.221023,0.034105,11.002100,11.863582,13.984088,0.004711
353444,90,2024-05,0.609430,25.078673,2.715420e-07,0.221023,0.034105,11.002100,11.863582,13.984088,-0.010551
353444,90,2024-06,0.461021,25.068066,2.744376e-07,0.221023,0.034105,11.002100,11.863582,13.984088,-0.021327


Ready for regression...

In [1555]:
fm_reg_df = fm_reg_df.rename(
    columns={
        "B/M": "BM", 
        "ln(ME)": "ln_ME", 
        "ln(GHGS1)": "ln_GHGS1", 
        "ln(GHGS2)": "ln_GHGS2", 
        "ln(GHGS3)": "ln_GHGS3", 
    }
)

fm_reg_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,beta,ln_ME,BM,opm,investment,ln_GHGS1,ln_GHGS2,ln_GHGS3,m_USD_ret
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
100080,90,2023-01,1.203150,23.262296,3.271265e-06,0.181892,-0.024072,14.458942,14.260196,15.712540,0.200248
100080,90,2023-02,1.481416,23.444824,2.725491e-06,0.181892,-0.024072,14.458942,14.260196,15.712540,-0.035668
100080,90,2023-03,1.477962,23.408504,2.826299e-06,0.181892,-0.024072,14.458942,14.260196,15.712540,0.068681
100080,90,2023-04,1.399726,23.474930,2.644661e-06,0.181892,-0.024072,14.458942,14.260196,15.712540,0.034171
100080,90,2023-05,1.525930,23.508530,2.557275e-06,0.181892,-0.024072,14.458942,14.260196,15.712540,-0.117421
...,...,...,...,...,...,...,...,...,...,...,...
353444,90,2024-03,0.736227,25.093321,2.675936e-07,0.221023,0.034105,11.002100,11.863582,13.984088,0.004163
353444,90,2024-04,0.707596,25.073973,2.728214e-07,0.221023,0.034105,11.002100,11.863582,13.984088,0.004711
353444,90,2024-05,0.609430,25.078673,2.715420e-07,0.221023,0.034105,11.002100,11.863582,13.984088,-0.010551
353444,90,2024-06,0.461021,25.068066,2.744376e-07,0.221023,0.034105,11.002100,11.863582,13.984088,-0.021327


In [1556]:
fm_reg_df["m_USD_ret"] = fm_reg_df["m_USD_ret"] * 100

fm_reg_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,beta,ln_ME,BM,opm,investment,ln_GHGS1,ln_GHGS2,ln_GHGS3,m_USD_ret
gvkey,iid,data_ym,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
100080,90,2023-01,1.203150,23.262296,3.271265e-06,0.181892,-0.024072,14.458942,14.260196,15.712540,20.024818
100080,90,2023-02,1.481416,23.444824,2.725491e-06,0.181892,-0.024072,14.458942,14.260196,15.712540,-3.566813
100080,90,2023-03,1.477962,23.408504,2.826299e-06,0.181892,-0.024072,14.458942,14.260196,15.712540,6.868132
100080,90,2023-04,1.399726,23.474930,2.644661e-06,0.181892,-0.024072,14.458942,14.260196,15.712540,3.417142
100080,90,2023-05,1.525930,23.508530,2.557275e-06,0.181892,-0.024072,14.458942,14.260196,15.712540,-11.742101
...,...,...,...,...,...,...,...,...,...,...,...
353444,90,2024-03,0.736227,25.093321,2.675936e-07,0.221023,0.034105,11.002100,11.863582,13.984088,0.416267
353444,90,2024-04,0.707596,25.073973,2.728214e-07,0.221023,0.034105,11.002100,11.863582,13.984088,0.471143
353444,90,2024-05,0.609430,25.078673,2.715420e-07,0.221023,0.034105,11.002100,11.863582,13.984088,-1.055100
353444,90,2024-06,0.461021,25.068066,2.744376e-07,0.221023,0.034105,11.002100,11.863582,13.984088,-2.132701


In [1557]:
res = fm_reg_df.groupby(
    level="data_ym"
).apply(
    lambda ym_df: smf.ols(
      formula="m_USD_ret ~ beta + ln_ME + opm + investment + ln_GHGS1 + ln_GHGS2 + ln_GHGS3", 
      data=ym_df
    ).fit(
    ).params
)

res

Unnamed: 0_level_0,Intercept,beta,ln_ME,opm,investment,ln_GHGS1,ln_GHGS2,ln_GHGS3
data_ym,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-01,55.397809,2.697393,-2.655528,0.000883,-1.322645,-1.031222,1.190225,0.83334
2023-02,-2.578253,-1.15113,-0.257578,0.000481,-0.02242,-0.050614,-1.019182,1.470404
2023-03,-34.15735,-0.05775,1.208207,-0.000641,0.064966,0.127677,0.224995,0.201746
2023-04,-7.296438,-1.249533,0.184982,-0.020721,2.975236,0.241832,0.360274,-0.226178
2023-05,-6.937306,1.099975,0.235226,-0.000121,2.633368,-0.396361,-0.250438,0.385537
2023-06,5.603396,1.614871,-0.683354,-0.003931,-1.821868,-0.315928,-0.522999,1.99788
2023-07,13.176812,1.502173,-0.549653,0.005701,1.615875,0.254197,-0.228598,0.2196
2023-08,-21.96435,-1.363229,0.812925,-0.000448,0.963265,0.063961,0.329263,-0.222302
2023-09,-7.139234,-2.500025,0.097215,-0.004148,0.152321,0.405657,0.032118,-0.092879
2023-10,-17.603834,-1.886481,0.572787,-0.027811,-0.895569,-0.075524,-0.257158,0.408199


In [1558]:
res.mean()

Intercept    -1.815004
beta         -0.086539
ln_ME         0.101736
opm          -0.000516
investment    0.452541
ln_GHGS1     -0.038374
ln_GHGS2     -0.264218
ln_GHGS3      0.349647
dtype: float64

## Checks on Duplicates

In [None]:
""" Now determine the number of unique companies in the 
different years
"""
print(f"All years: {np.unique(df['fiscalyear'])}")

for year in np.unique(df['fiscalyear']):
    print(f"{year} => {np.unique(df[df['fiscalyear'] == year]['gvkey']).size}")

NameError: name 'df' is not defined

In [None]:
"""
All years: [2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015
 2016 2017 2018 2019 2020 2021 2022 2023]
2002 => 1763
2003 => 2000
2004 => 2885
2005 => 3880
2006 => 4170
2007 => 4307
2008 => 4269
2009 => 4563
2010 => 4723
2011 => 4833
2012 => 4868
2013 => 5757
2014 => 6154
2015 => 6235
2016 => 13882
2017 => 14786
2018 => 16979
2019 => 17378
2020 => 23353
2021 => 23386
2022 => 24053
2023 => 9725
"""

In [None]:
# unique non-nan in returns
returns_gvkeys = np.unique(returns_df[~np.isnan(returns_df["gvkey"])]["gvkey"])

print(returns_gvkeys.size)

24110


In [None]:
env_ret_common_gvkeys = np.intersect1d(
    env_gvkeys, 
    returns_gvkeys, 
    assume_unique=True
)

print(env_ret_common_gvkeys.size)

24110


In [None]:
""" Check the missing gvkeys to see what country they are from """
missing_gvkeys = np.setdiff1d(env_gvkeys, env_ret_common_gvkeys, assume_unique=True)

print(missing_gvkeys.size)

7400


In [None]:
""" Create representatives and break down their distribution """
df_missing_reprs_idx = [(df["gvkey"] == gvkey).idxmax() for gvkey in missing_gvkeys]
missing_dist = df.iloc[df_missing_reprs_idx][["gvkey", "country"]].groupby(
    "country"
).count().reset_index().sort_values('gvkey', ascending=False)

In [None]:
missing_dist

Unnamed: 0,country,gvkey
79,United States,5220
10,Canada,803
78,United Kingdom,271
13,China,113
24,France,111
...,...,...
58,Peru,1
57,Panama,1
56,Pakistan,1
48,Marshall Islands,1


In [None]:
missing_dist[missing_dist["country"] != "United States"]["gvkey"].sum()

2180

In [15]:
company_dups = df[df.duplicated('gvkey', keep=False) == True].sort_values(by="gvkey")

In [26]:
last_iid = None
last_fyears = set()
for i, row in company_dups.iterrows():
    current_iid = row["institutionid"]
    current_fyear = row["fiscalyear"]
    if last_iid is not None and current_iid == last_iid:
        if current_fyear in last_fyears:
            raise Exception(f"Fyear clash!, iid: {current_iid}, prev_fyears: {last_fyears}, clash: {current_fyear}")
        else:
            last_fyears.add(current_fyear)
    else:
        last_iid = current_iid
        last_fyears.clear()
        last_fyears.add(current_fyear)

print("No issues...")

Exception: Fyear clash!, iid: 4415462, prev_fyears: {2022}, clash: 2022

In [31]:
company_dups.head(20)

Unnamed: 0,periodid,institutionid,reportedcurrencyisocode,tcprimarysectorid,fiscalyear,periodenddate,di_319380,di_319381,di_319382,di_319383,...,streetaddress3,streetaddress4,zipcode,yearfounded,monthfounded,dayfounded,officephonevalue,otherphonevalue,officefaxvalue,webpage
5,30D218CF-2E2A-46B4-AF72-CADF593290E8,4074603,USD,713A00,2022,01/01/2023,0.285,0.021,2.079,0.153,...,,,76011,1961.0,,,972 595 5000,,,www.sixflags.com
2503,1989BCA5-218F-4857-A3CE-1F2114D67F8E,4074603,USD,713A00,2023,31/12/2023,0.302,0.021,2.098,0.147,...,,,76011,1961.0,,,972 595 5000,,,www.sixflags.com
4988,5B249039-D1E2-43D2-825B-DDD233FE2FE0,4996548,USD,561300,2023,31/12/2023,0.015,0.005,0.125,0.04,...,,,75024,2007.0,,,972 692 2400,,,bgsf.com
40,5F205052-5493-41C8-9C8C-1E0FE1FC5DC0,4996548,USD,561300,2022,01/01/2023,0.014,0.005,0.122,0.041,...,,,75024,2007.0,,,972 692 2400,,,bgsf.com
1,01C53196-7DD9-42C7-9D3B-F152BFB3A364,4054841,USD,445000A,2022,01/01/2023,3.061,0.003,186.279,0.203,...,,,1506 MA,1867.0,,,31 88 659 9111,,,www.aholddelhaize.com
2462,793C84DE-3E9D-4CE0-8768-647BA85F9272,4054841,USD,445000A,2023,31/12/2023,3.183,0.003,181.084,0.189,...,,,1506 MA,1867.0,,,31 88 659 9111,,,www.aholddelhaize.com
110,A706AF20-7252-4C95-AA58-2481363942C5,10175068,USD,722000,2022,02/01/2023,0.095,0.053,0.696,0.389,...,,,33309,2011.0,,,954-618-2000,,,www.burgerfi.com
6071,96731D0E-B576-41E4-8386-DAD27811DF60,10175068,USD,722000,2023,01/01/2024,0.092,0.054,0.625,0.368,...,,,33309,2011.0,,,954-618-2000,,,www.burgerfi.com
99,0446986D-7007-4B72-AA5B-49AA5ADB4CF2,28295169,USD,541512,2022,01/01/2023,0.042,0.02,0.362,0.17,...,,,55425,2016.0,,,952 851 5200,,,www.skywatertechnology.com
6011,9AABC54A-2E82-4C6A-99E2-489EF08AE342,28295169,USD,541512,2023,31/12/2023,0.051,0.018,0.42,0.147,...,,,55425,2016.0,,,952 851 5200,,,www.skywatertechnology.com


In [32]:
company_dups[company_dups["institutionid"] == 4415462]

Unnamed: 0,periodid,institutionid,reportedcurrencyisocode,tcprimarysectorid,fiscalyear,periodenddate,di_319380,di_319381,di_319382,di_319383,...,streetaddress3,streetaddress4,zipcode,yearfounded,monthfounded,dayfounded,officephonevalue,otherphonevalue,officefaxvalue,webpage
23,5282F3CE-9CFA-4830-9116-3B36EFDE5089,4415462,USD,52A000,2022,01/01/2023,0.001,0.001,0.054,0.037,...,,,50059.0,1993.0,,,7 727 244 5484,,7 727 244 5480,www.homecredit.kz
24,5282F3CE-9CFA-4830-9116-3B36EFDE5089,4415462,USD,52A000,2022,01/01/2023,0.001,0.001,0.054,0.037,...,,,,,,,,,,


array(['United States', 'Netherlands', 'United Kingdom', 'Canada',
       'Kazakhstan', 'Belarus', 'Australia', 'Belgium', 'Austria',
       'Finland', 'Ireland', nan, 'Singapore', 'France', 'Denmark',
       'Japan', 'Israel', 'Italy', 'South Africa', 'Thailand', 'Germany',
       'China', 'Hong Kong', 'Luxembourg', 'India', 'Switzerland',
       'Malaysia', 'South Korea', 'Kenya', 'New Zealand', 'Spain',
       'Pakistan', 'Saudi Arabia', 'Sweden', 'British Virgin Islands',
       'Kuwait', 'Turkey', 'Philippines', 'Mauritius', 'Bangladesh',
       'Cayman Islands', 'Botswana', 'Egypt', 'Malta', 'Malawi',
       'Jamaica', 'Bermuda', 'Colombia', 'Mexico', 'Norway', 'Brazil',
       'Bahrain', 'Morocco', 'Indonesia', 'Romania', 'Russia',
       'Ivory Coast', 'Tunisia', 'Greece', 'Vietnam', 'Taiwan', 'Nigeria',
       'Oman', 'Qatar', 'Portugal', 'United Arab Emirates', 'Jersey',
       'Poland', 'Bulgaria', 'Chile', 'Reunion', 'Ghana', 'Monaco',
       'Bahamas', 'Guernsey'], dtype=o