# 

# Phase 1
> 2022-2024; NA Listed Firms;

## Imports

In [1]:
import pandas as pd
import numpy as np

## Load Emissions Data

In [112]:
""" Read in the CSV """
raw_emissions_df = pd.read_csv(
    "phase1_emissions.csv", 
    dtype={"companyid": "str", "gvkey": "str"} #identifiers are read in
)

raw_emissions_df

Unnamed: 0,institutionid,fiscalyear,periodenddate,di_319413,di_319414,di_319415,companyid,gvkey,companyname,country
0,11489,2022,2022-12-31,1859.947024,5704.373536,78880.089343,6520204,,Factory Mutual Insurance Company,United States
1,11489,2022,2022-12-31,1859.947024,5704.373536,78880.089343,24951392,,"Factory Mutual Insurance Company, Asset Manage...",United States
2,11489,2022,2022-12-31,1859.947024,5704.373536,78880.089343,43964734,,"Allendale Mutual Insurance Co, Asset Managemen...",United States
3,11489,2023,2023-12-31,4154.828402,12742.671094,176205.682880,6520204,,Factory Mutual Insurance Company,United States
4,11489,2023,2023-12-31,4154.828402,12742.671094,176205.682880,24951392,,"Factory Mutual Insurance Company, Asset Manage...",United States
...,...,...,...,...,...,...,...,...,...,...
1803283,118526590,2022,2022-12-31,485.540053,1085.798816,1022.548683,111952300,,Coseva Soc Coop,Italy
1803284,118706040,2022,2022-12-31,205.995561,658.395477,1709.587413,1886594015,,Solà Domingo S.A.,Spain
1803285,118918440,2022,2022-07-31,40.124678,26.875831,83.556928,1887792515,,"Nihonsangyo Co.,Ltd.",Japan
1803286,118991426,2022,2022-12-31,59.719416,216.809138,129.375410,1888091056,,CP Management Spólka Z Ograniczona Odpowiedzia...,Poland


Inspect the loaded datatypes

In [113]:
raw_emissions_df.dtypes

institutionid      int64
fiscalyear         int64
periodenddate     object
di_319413        float64
di_319414        float64
di_319415        float64
companyid         object
gvkey             object
companyname       object
country           object
dtype: object

The company ids are extracted and written out to a file

In [3]:
""" Utility functions regarding validity, uniqueness, and writing out 
unique and valid ids. """

def keep_valid(data, colname=None):
    """ Only keeps the rows/items from the given dataframe/array-like which 
    have valid values - for the specified column in the case of a dataframe.

    Args:
        data: Dataframe or array-like.
        colname: Name of the dataframe column whose valid values we are using to 
            filter.

    Returns:
        The dataframe/array-like with rows/items that have invalid values, w.r.t 
        the specified column if applicable, filtered out.
    """

    # Array whose values we are interested in
    col = data[colname] if colname is not None else data
    # Values are considered valid as long as they are not NaN
    return data[pd.notnull(col)]


def extract_unique(df, colname):
    """ Extract the unique and non-NaN values from a dataframe column.
    
    Args:
        df: Dataframe.
        colname: Column name w.r.t the dataframe.

    Returns:
        Unique and non-NaN column values.
    """

    return keep_valid(pd.unique(df[colname]))


def write_ids(df, idname, filename):
    """ Writes the unique, non-NaN instances of the indicated identifier, within the 
    indicated dataframe, on separate lines of a new file, whose filename should 
    be specified.

    Args:
        df: Dataframe.
        idname: Column name of the identifier with respect to the dataframe.
        filename: The name to use for the newly created file.
    """
    with open(filename, "w") as fh:
        for idval in extract_unique(df, idname):
            fh.write(f"{idval}\n")

In [4]:
""" Export CIQ company ids """
write_ids(raw_emissions_df, "companyid", "companyids.txt")

The number of unique company ids and gvkeys are reported for this raw emissions data

In [105]:
""" Utility function """
def report_unique(df, colname):
    """ Reports unique, non-NaN values of a dataframe column
    
    Args:
        df: Dataframe.
        colname: Column name.
    """

    print(f"Number of unique, non-null values of \'{colname}\': {len(extract_unique(df, colname))}")

In [107]:
print("-- Raw Emissions Data")
report_unique(raw_emissions_df, "institutionid")
report_unique(raw_emissions_df, "companyid")
report_unique(raw_emissions_df, "gvkey")

-- Raw Emissions Data
Number of unique, non-null values of 'institutionid': 1720932
Number of unique, non-null values of 'companyid': 1725359
Number of unique, non-null values of 'gvkey': 24304


In [155]:
(raw_emissions_df["companyid"].isnull() & raw_emissions_df["gvkey"].notnull()).sum()

0

In [150]:
test_df = raw_emissions_df.groupby(["gvkey", "fiscalyear"]).agg(
    {
        # "institutionid": lambda vals: (vals.iloc[0] == vals).all() if vals.count() != 1 else True, 
        "di_319413": lambda vals: np.nan if (vals.iloc[0] == vals).all() else " ".join(vals.astype(str)), 
        "di_319414": lambda vals: np.nan if (vals.iloc[0] == vals).all() else " ".join(vals.astype(str)), 
        "di_319415": lambda vals: np.nan if (vals.iloc[0] == vals).all() else " ".join(vals.astype(str)), 
    }
)

test_df

Unnamed: 0_level_0,Unnamed: 1_level_0,di_319413,di_319414,di_319415
gvkey,fiscalyear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
001004,2022,,,
001045,2022,,,
001045,2023,,,
001050,2022,,,
001050,2023,,,
...,...,...,...,...
362683,2022,,,
362705,2022,,,
362758,2022,,,
362761,2022,,,


In [152]:
test_df[pd.notnull(test_df["di_319413"]) | pd.notnull(test_df["di_319414"]) | pd.notnull(test_df["di_319415"])]

Unnamed: 0_level_0,Unnamed: 1_level_0,di_319413,di_319414,di_319415
gvkey,fiscalyear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
003413,2022,2867127.0 51062759.241,1018275.0 3933.0872579,1650402.398 2024652.4121
003897,2022,27114550.031 25358937.262,197000.0 1953.2613298,4025432.601 1005488.8192
004094,2022,810.06 1410.524 7995.579,7696.0 2956.39 9723.273,197193.363 84877.747 39817.706
005180,2022,636360.11 53101.342601,251038.0 32063.589498,2436669.603 254265.62927
005600,2022,1080082.1763 4073314.057,61712.663909 73116.0,850262.94718 353150.506
...,...,...,...,...
275535,2022,734.626 112.41037212,4498.412 111.2256901,8751.123 2720.5732053
275535,2023,611.138 80.405872624,3819.836 79.558482923,6836.914 1945.9953604
289724,2022,101.653 0.0053786782,1642.732 0.0869208615,7071.309 0.3741598245
295786,2022,134627.057 0.9207597207,129177.195 3.5451698548,5009667.962 64.639735408


In [139]:
test_df = raw_emissions_df.groupby("gvkey").agg({"fiscalyear": lambda vals: vals.count() == 3})

In [142]:
test_df[test_df["fiscalyear"]]

Unnamed: 0_level_0,fiscalyear
gvkey,Unnamed: 1_level_1
001107,True
001913,True
003413,True
003897,True
004186,True
...,...
292533,True
294733,True
294737,True
313594,True


## Load Cid-Gvkey Mappings
> From the relevent CIQ Linking Web Query

In [100]:
cid_gvkey_df = pd.read_csv("cid_gvkey_map.csv")

cid_gvkey_df

Unnamed: 0,companyid,gvkey,startdate,enddate,companyname
0,18511,210835,B,E,3i Group plc
1,18527,210418,B,E,ABB Ltd
2,18671,29751,B,E,Albemarle Corporation
3,18711,28349,B,E,The Allstate Corporation
4,18749,64768,B,E,"Amazon.com, Inc."
...,...,...,...,...,...
24388,1849817514,361808,B,E,Kawan Renergy Berhad
24389,1855364409,358709,B,E,ELSA Solutions S.p.A.
24390,1855399529,358653,B,E,"SEIYU KOGYO Co.,Ltd."
24391,1859487646,359029,B,E,KET Inc.


Inspect the loaded datatypes

In [101]:
cid_gvkey_df.dtypes

companyid       int64
gvkey           int64
startdate      object
enddate        object
companyname    object
dtype: object

Keep the entries of the 1-to-1 mappings

In [103]:
cid_gvkey_1t1_df = cid_gvkey_df[(cid_gvkey_df["startdate"] == "B") & (cid_gvkey_df["enddate"] == "E")]

cid_gvkey_1t1_df

Unnamed: 0,companyid,gvkey,startdate,enddate,companyname
0,18511,210835,B,E,3i Group plc
1,18527,210418,B,E,ABB Ltd
2,18671,29751,B,E,Albemarle Corporation
3,18711,28349,B,E,The Allstate Corporation
4,18749,64768,B,E,"Amazon.com, Inc."
...,...,...,...,...,...
24388,1849817514,361808,B,E,Kawan Renergy Berhad
24389,1855364409,358709,B,E,ELSA Solutions S.p.A.
24390,1855399529,358653,B,E,"SEIYU KOGYO Co.,Ltd."
24391,1859487646,359029,B,E,KET Inc.


In [161]:
raw_emissions_df[pd.notnull(raw_emissions_df["gvkey"])]

Unnamed: 0,institutionid,fiscalyear,periodenddate,di_319413,di_319414,di_319415,companyid,gvkey,companyname,country
26,13959,2022,2022-12-31,18853.165084,72589.700681,1.323541e+06,246652,122594,State Farm Mutual Automobile Insurance Company,United States
29,13959,2023,2023-12-31,23250.617734,89521.063147,1.632254e+06,246652,122594,State Farm Mutual Automobile Insurance Company,United States
32,14193,2022,2022-12-31,16736.521444,51330.155513,7.097935e+05,417155,122554,United Services Automobile Association,United States
33,14193,2023,2023-12-31,19585.900250,60069.071644,8.306352e+05,417155,122554,United Services Automobile Association,United States
34,14432,2022,2022-12-31,1027.993067,3152.808316,4.359704e+04,3544839,063734,Voya Retirement Insurance and Annuity Company,United States
...,...,...,...,...,...,...,...,...,...,...
1796814,112329550,2022,2022-10-31,489.910950,1022.660743,6.379460e+03,1849817514,361808,Kawan Renergy Berhad,Malaysia
1803250,113511623,2022,2022-12-31,291.908186,183.331947,5.197957e+02,1855364409,358709,ELSA Solutions S.p.A.,Italy
1803251,113511665,2022,2022-09-30,12.539324,0.790235,6.400957e+00,1855399529,358653,"SEIYU KOGYO Co.,Ltd.",Japan
1803259,114230580,2022,2022-03-31,2.811856,0.177205,1.435370e+00,1859487646,359029,KET Inc.,Japan


In [157]:
emissions_df = raw_emissions_df[
    np.in1d(raw_emissions_df["gvkey"], cid_gvkey_1t1_df["gvkey"].astype(str))
]

emissions_df

Unnamed: 0,institutionid,fiscalyear,periodenddate,di_319413,di_319414,di_319415,companyid,gvkey,companyname,country
26,13959,2022,2022-12-31,18853.165084,72589.700681,1.323541e+06,246652,122594,State Farm Mutual Automobile Insurance Company,United States
29,13959,2023,2023-12-31,23250.617734,89521.063147,1.632254e+06,246652,122594,State Farm Mutual Automobile Insurance Company,United States
32,14193,2022,2022-12-31,16736.521444,51330.155513,7.097935e+05,417155,122554,United Services Automobile Association,United States
33,14193,2023,2023-12-31,19585.900250,60069.071644,8.306352e+05,417155,122554,United Services Automobile Association,United States
38,14467,2022,2022-12-31,799.396226,2451.712127,3.390228e+04,675974,263562,Everlake Life Insurance Company,United States
...,...,...,...,...,...,...,...,...,...,...
1796814,112329550,2022,2022-10-31,489.910950,1022.660743,6.379460e+03,1849817514,361808,Kawan Renergy Berhad,Malaysia
1803250,113511623,2022,2022-12-31,291.908186,183.331947,5.197957e+02,1855364409,358709,ELSA Solutions S.p.A.,Italy
1803251,113511665,2022,2022-09-30,12.539324,0.790235,6.400957e+00,1855399529,358653,"SEIYU KOGYO Co.,Ltd.",Japan
1803259,114230580,2022,2022-03-31,2.811856,0.177205,1.435370e+00,1859487646,359029,KET Inc.,Japan


In [159]:
check_df = emissions_df.groupby(["gvkey", "fiscalyear"]).agg(
    {
        "di_319413": "count", 
        "di_319414": "count", 
        "di_319415": "count", 
    }
)

In [160]:
check_df[(check_df["di_319413"] != 1) | (check_df["di_319414"] != 1) | (check_df["di_319415"] != 1)]

Unnamed: 0_level_0,Unnamed: 1_level_0,di_319413,di_319414,di_319415
gvkey,fiscalyear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


Back to the original mappings, check for null and duplicated values for 
both Cid and Gvkey.

In [45]:
print("-- Null Counts")
print(f"{cid_gvkey_df["companyid"].isnull().sum()}")
print(cid_gvkey_df["gvkey"].isnull().sum())
print("-- Duplicates Present? (T/F)")
print(cid_gvkey_df["companyid"].duplicated().any())
print(raw_emissions_df["companyid"].duplicated().any())

0
0
True
True


In [86]:
def ensure_consistency(vals):
    if vals.count() == 0:
        return -1

    vals = keep_valid(vals)

    if (vals.iloc[0] == vals).all():
        # return vals.iloc[0]
        return np.nan
    else:
        return " ".join(vals)

emissions_df = raw_emissions_df.groupby(["companyid"]).agg(
    {"gvkey": lambda vals: ensure_consistency(vals)}
)

emissions_df

Unnamed: 0_level_0,gvkey
companyid,Unnamed: 1_level_1
100000307,-1
100013,
1000212,-1
1000277,-1
10004497,
...,...
99996472,-1
99996476,-1
99996998,-1
99997106,-1


In [79]:
pd.unique(emissions_df["gvkey"])

array([nan, ''], dtype=object)

In [83]:
(pd.notnull(emissions_df["gvkey"])).sum()

177

In [87]:
emissions_df[pd.isnull(emissions_df["gvkey"])]

Unnamed: 0_level_0,gvkey
companyid,Unnamed: 1_level_1
100013,
10004497,
10004724,
10004802,
10004874,
...,...
9994690,
9996089,
99985539,
99990431,


In [93]:
gvkey_gb = raw_emissions_df.groupby(["gvkey", "fiscalyear"]).agg(
    {"companyid": lambda vals: vals.count()}
)

In [95]:
gvkey_gb[gvkey_gb["companyid"] == 2]

Unnamed: 0_level_0,Unnamed: 1_level_0,companyid
gvkey,fiscalyear,Unnamed: 2_level_1
002856,2022,2
002856,2023,2
003413,2022,2
003897,2022,2
005180,2022,2
...,...,...
275535,2022,2
275535,2023,2
289724,2022,2
295786,2022,2


In [42]:
emissions_df.loc[
    cid_gvkey_df["companyid"].astype(str), "gvkey"
] = cid_gvkey_df.astype({"companyid": "str"}).set_index("companyid")["gvkey"].astype(str)

  emissions_df.loc[


ValueError: cannot reindex on an axis with duplicate labels

In [31]:
pd.unique(emissions_df["gvkey"])

array([nan], dtype=object)

In [27]:
len(extract_unique(raw_emissions_df, "gvkey"))

24304

keep going

In [40]:
""" Quality control """
print(f"Prior number of rows {len(raw_emissions_df)}")
emissions_df = raw_emissions_df[~np.isnan(raw_emissions_df["gvkey"])]
emissions_df = emissions_df.drop(columns=["di_326737", "di_367750"])
emissions_df = emissions_df.dropna()
print(f"Subsequent number of rows {len(emissions_df)}")

Prior number of rows 1803288
Subsequent number of rows 33997


In [32]:
emissions_df

Unnamed: 0,institutionid,fiscalyear,periodenddate,di_319413,di_319414,di_319415,di_326737,di_367750,gvkey,companyname,country
26,13959,2022,2022-12-31,18853.165084,72589.700681,1.323541e+06,,,122594.0,State Farm Mutual Automobile Insurance Company,United States
29,13959,2023,2023-12-31,23250.617734,89521.063147,1.632254e+06,,,122594.0,State Farm Mutual Automobile Insurance Company,United States
32,14193,2022,2022-12-31,16736.521444,51330.155513,7.097935e+05,,,122554.0,United Services Automobile Association,United States
33,14193,2023,2023-12-31,19585.900250,60069.071644,8.306352e+05,,,122554.0,United Services Automobile Association,United States
34,14432,2022,2022-12-31,1027.993067,3152.808316,4.359704e+04,,,63734.0,Voya Retirement Insurance and Annuity Company,United States
...,...,...,...,...,...,...,...,...,...,...,...
1796814,112329550,2022,2022-10-31,489.910950,1022.660743,6.379460e+03,,,361808.0,Kawan Renergy Berhad,Malaysia
1803250,113511623,2022,2022-12-31,291.908186,183.331947,5.197957e+02,,,358709.0,ELSA Solutions S.p.A.,Italy
1803251,113511665,2022,2022-09-30,12.539324,0.790235,6.400957e+00,,,358653.0,"SEIYU KOGYO Co.,Ltd.",Japan
1803259,114230580,2022,2022-03-31,2.811856,0.177205,1.435370e+00,,,359029.0,KET Inc.,Japan


In [19]:
df.groupby(["gvkey", "fiscalyear"]).size().index

MultiIndex([(  1004.0, 2022),
            (  1045.0, 2022),
            (  1045.0, 2023),
            (  1050.0, 2022),
            (  1050.0, 2023),
            (  1075.0, 2022),
            (  1076.0, 2022),
            (  1078.0, 2022),
            (  1078.0, 2023),
            (  1096.0, 2022),
            ...
            (362534.0, 2022),
            (362534.0, 2023),
            (362619.0, 2022),
            (362620.0, 2022),
            (362620.0, 2023),
            (362683.0, 2022),
            (362705.0, 2022),
            (362758.0, 2022),
            (362761.0, 2022),
            (362761.0, 2023)],
           names=['gvkey', 'fiscalyear'], length=33895)

## Load Market Data

In [23]:
market_df = pd.read_csv("phase1_market.csv")

market_df

Unnamed: 0,gvkeyx,prccm,datadate,conm,tic
0,150918,3379.0814,2022-01-31,S&P Global 1200 Index,I6UNK112
1,150918,3283.8722,2022-02-28,S&P Global 1200 Index,I6UNK112
2,150918,3361.597,2022-03-31,S&P Global 1200 Index,I6UNK112
3,150918,3089.862,2022-04-30,S&P Global 1200 Index,I6UNK112
4,150918,3097.8104,2022-05-31,S&P Global 1200 Index,I6UNK112
5,150918,2827.3566,2022-06-30,S&P Global 1200 Index,I6UNK112
6,150918,3032.165,2022-07-31,S&P Global 1200 Index,I6UNK112
7,150918,2899.363,2022-08-31,S&P Global 1200 Index,I6UNK112
8,150918,2619.9194,2022-09-30,S&P Global 1200 Index,I6UNK112
9,150918,2795.4613,2022-10-31,S&P Global 1200 Index,I6UNK112


## Missing data checks to prepare for imputation

In [None]:
""" Missing Data Checks """

In [4]:
""" Number of unique companies across all the years """
np.unique(df["gvkey"]).size # roughly 30k

31511

In [10]:
""" Now determine the number of unique companies in the 
different years
"""
print(f"All years: {np.unique(df['fiscalyear'])}")

for year in np.unique(df['fiscalyear']):
    print(f"{year} => {np.unique(df[df['fiscalyear'] == year]['gvkey']).size}")

All years: [2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015
 2016 2017 2018 2019 2020 2021 2022 2023]
2002 => 1763
2003 => 2000
2004 => 2885
2005 => 3880
2006 => 4170
2007 => 4307
2008 => 4269
2009 => 4563
2010 => 4723
2011 => 4833
2012 => 4868
2013 => 5757
2014 => 6154
2015 => 6235
2016 => 13882
2017 => 14786
2018 => 16979
2019 => 17378
2020 => 23353
2021 => 23386
2022 => 24053
2023 => 9725


In [None]:
# print(df[])

## Write out all the unique gvkeys

## Read in the daily security prices csv

In [16]:
returns_df = pd.read_csv("daily.csv")

  returns_df = pd.read_csv("daily.csv")


In [17]:
returns_df

Unnamed: 0,gvkey,iid,datadate,conm,trfd,isin,county,sic
0,1166,01W,2002-01-02,ASM INTERNATIONAL NV,1.000000,NL0000334118,,3559.0
1,1166,01W,2002-01-03,ASM INTERNATIONAL NV,1.000000,NL0000334118,,3559.0
2,1166,01W,2002-01-04,ASM INTERNATIONAL NV,1.000000,NL0000334118,,3559.0
3,1166,01W,2002-01-07,ASM INTERNATIONAL NV,1.000000,NL0000334118,,3559.0
4,1166,01W,2002-01-08,ASM INTERNATIONAL NV,1.000000,NL0000334118,,3559.0
...,...,...,...,...,...,...,...,...
113958618,362779,01W,2024-08-06,NOVAMARINE SPA,1.145429,IT0005605701,,
113958619,362779,01W,2024-08-07,NOVAMARINE SPA,1.145429,IT0005605701,,
113958620,362779,01W,2024-08-08,NOVAMARINE SPA,1.145429,IT0005605701,,
113958621,362779,01W,2024-08-09,NOVAMARINE SPA,1.145429,IT0005605701,,


## Check the intersect between the gvkeys of the carbon emissions dataset and the daily prices dataset

In [31]:
# unique non-nan in environment
env_gvkeys = np.unique(df[~np.isnan(df["gvkey"])]["gvkey"])

print(env_gvkeys.size)

31510


In [32]:
# unique non-nan in returns
returns_gvkeys = np.unique(returns_df[~np.isnan(returns_df["gvkey"])]["gvkey"])

print(returns_gvkeys.size)

24110


In [33]:
env_ret_common_gvkeys = np.intersect1d(
    env_gvkeys, 
    returns_gvkeys, 
    assume_unique=True
)

print(env_ret_common_gvkeys.size)

24110


In [34]:
""" Check the missing gvkeys to see what country they are from """
missing_gvkeys = np.setdiff1d(env_gvkeys, env_ret_common_gvkeys, assume_unique=True)

print(missing_gvkeys.size)

7400


In [77]:
""" Create representatives and break down their distribution """
df_missing_reprs_idx = [(df["gvkey"] == gvkey).idxmax() for gvkey in missing_gvkeys]
missing_dist = df.iloc[df_missing_reprs_idx][["gvkey", "country"]].groupby(
    "country"
).count().reset_index().sort_values('gvkey', ascending=False)

In [78]:
missing_dist

Unnamed: 0,country,gvkey
79,United States,5220
10,Canada,803
78,United Kingdom,271
13,China,113
24,France,111
...,...,...
58,Peru,1
57,Panama,1
56,Pakistan,1
48,Marshall Islands,1


In [79]:
missing_dist[missing_dist["country"] != "United States"]["gvkey"].sum()

2180

## Checks on Duplicates

In [15]:
company_dups = df[df.duplicated('gvkey', keep=False) == True].sort_values(by="gvkey")

In [26]:
last_iid = None
last_fyears = set()
for i, row in company_dups.iterrows():
    current_iid = row["institutionid"]
    current_fyear = row["fiscalyear"]
    if last_iid is not None and current_iid == last_iid:
        if current_fyear in last_fyears:
            raise Exception(f"Fyear clash!, iid: {current_iid}, prev_fyears: {last_fyears}, clash: {current_fyear}")
        else:
            last_fyears.add(current_fyear)
    else:
        last_iid = current_iid
        last_fyears.clear()
        last_fyears.add(current_fyear)

print("No issues...")

Exception: Fyear clash!, iid: 4415462, prev_fyears: {2022}, clash: 2022

In [31]:
company_dups.head(20)

Unnamed: 0,periodid,institutionid,reportedcurrencyisocode,tcprimarysectorid,fiscalyear,periodenddate,di_319380,di_319381,di_319382,di_319383,...,streetaddress3,streetaddress4,zipcode,yearfounded,monthfounded,dayfounded,officephonevalue,otherphonevalue,officefaxvalue,webpage
5,30D218CF-2E2A-46B4-AF72-CADF593290E8,4074603,USD,713A00,2022,01/01/2023,0.285,0.021,2.079,0.153,...,,,76011,1961.0,,,972 595 5000,,,www.sixflags.com
2503,1989BCA5-218F-4857-A3CE-1F2114D67F8E,4074603,USD,713A00,2023,31/12/2023,0.302,0.021,2.098,0.147,...,,,76011,1961.0,,,972 595 5000,,,www.sixflags.com
4988,5B249039-D1E2-43D2-825B-DDD233FE2FE0,4996548,USD,561300,2023,31/12/2023,0.015,0.005,0.125,0.04,...,,,75024,2007.0,,,972 692 2400,,,bgsf.com
40,5F205052-5493-41C8-9C8C-1E0FE1FC5DC0,4996548,USD,561300,2022,01/01/2023,0.014,0.005,0.122,0.041,...,,,75024,2007.0,,,972 692 2400,,,bgsf.com
1,01C53196-7DD9-42C7-9D3B-F152BFB3A364,4054841,USD,445000A,2022,01/01/2023,3.061,0.003,186.279,0.203,...,,,1506 MA,1867.0,,,31 88 659 9111,,,www.aholddelhaize.com
2462,793C84DE-3E9D-4CE0-8768-647BA85F9272,4054841,USD,445000A,2023,31/12/2023,3.183,0.003,181.084,0.189,...,,,1506 MA,1867.0,,,31 88 659 9111,,,www.aholddelhaize.com
110,A706AF20-7252-4C95-AA58-2481363942C5,10175068,USD,722000,2022,02/01/2023,0.095,0.053,0.696,0.389,...,,,33309,2011.0,,,954-618-2000,,,www.burgerfi.com
6071,96731D0E-B576-41E4-8386-DAD27811DF60,10175068,USD,722000,2023,01/01/2024,0.092,0.054,0.625,0.368,...,,,33309,2011.0,,,954-618-2000,,,www.burgerfi.com
99,0446986D-7007-4B72-AA5B-49AA5ADB4CF2,28295169,USD,541512,2022,01/01/2023,0.042,0.02,0.362,0.17,...,,,55425,2016.0,,,952 851 5200,,,www.skywatertechnology.com
6011,9AABC54A-2E82-4C6A-99E2-489EF08AE342,28295169,USD,541512,2023,31/12/2023,0.051,0.018,0.42,0.147,...,,,55425,2016.0,,,952 851 5200,,,www.skywatertechnology.com


In [32]:
company_dups[company_dups["institutionid"] == 4415462]

Unnamed: 0,periodid,institutionid,reportedcurrencyisocode,tcprimarysectorid,fiscalyear,periodenddate,di_319380,di_319381,di_319382,di_319383,...,streetaddress3,streetaddress4,zipcode,yearfounded,monthfounded,dayfounded,officephonevalue,otherphonevalue,officefaxvalue,webpage
23,5282F3CE-9CFA-4830-9116-3B36EFDE5089,4415462,USD,52A000,2022,01/01/2023,0.001,0.001,0.054,0.037,...,,,50059.0,1993.0,,,7 727 244 5484,,7 727 244 5480,www.homecredit.kz
24,5282F3CE-9CFA-4830-9116-3B36EFDE5089,4415462,USD,52A000,2022,01/01/2023,0.001,0.001,0.054,0.037,...,,,,,,,,,,


array(['United States', 'Netherlands', 'United Kingdom', 'Canada',
       'Kazakhstan', 'Belarus', 'Australia', 'Belgium', 'Austria',
       'Finland', 'Ireland', nan, 'Singapore', 'France', 'Denmark',
       'Japan', 'Israel', 'Italy', 'South Africa', 'Thailand', 'Germany',
       'China', 'Hong Kong', 'Luxembourg', 'India', 'Switzerland',
       'Malaysia', 'South Korea', 'Kenya', 'New Zealand', 'Spain',
       'Pakistan', 'Saudi Arabia', 'Sweden', 'British Virgin Islands',
       'Kuwait', 'Turkey', 'Philippines', 'Mauritius', 'Bangladesh',
       'Cayman Islands', 'Botswana', 'Egypt', 'Malta', 'Malawi',
       'Jamaica', 'Bermuda', 'Colombia', 'Mexico', 'Norway', 'Brazil',
       'Bahrain', 'Morocco', 'Indonesia', 'Romania', 'Russia',
       'Ivory Coast', 'Tunisia', 'Greece', 'Vietnam', 'Taiwan', 'Nigeria',
       'Oman', 'Qatar', 'Portugal', 'United Arab Emirates', 'Jersey',
       'Poland', 'Bulgaria', 'Chile', 'Reunion', 'Ghana', 'Monaco',
       'Bahamas', 'Guernsey'], dtype=o