In [9]:
import logging
import os
import sys

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

sys.path.append('../')

from utils import Graph

# Connection settings for the graph database. Each value can be overridden
# through an environment variable so that credentials do not have to be stored
# in the notebook; the fallbacks are the original local-development values.
# NOTE(review): the three positional arguments are presumably URI, user and
# password - confirm against utils.Graph.
graph = Graph(
    os.environ.get('NEO4J_URI', 'bolt://localhost:7687'),
    os.environ.get('NEO4J_USER', 'neo4j'),
    os.environ.get('NEO4J_PASSWORD', 'neo4jneo4j'),
)

In [10]:
# Route INFO-and-above log records to a file that is opened in write mode,
# i.e. overwritten each time this cell configures logging.
logging.basicConfig(
    level=logging.INFO,
    format='%(name)s - %(levelname)s - %(message)s',
    filename='../data/logs/link_prediction_country_rem.log',
    filemode='w',
)

## Companies in Taiwan

First, let us find all the companies around the world that are supplied by companies in Taiwan.

In [11]:
# Companies that are connected to a company in Taiwan

def find_comp_in_country(country_code:str) -> pd.DataFrame:
    """
    Return the SUPPLIES_TO pairs whose supplier is located in a given country.

    The country code is sent to the database as a Cypher query parameter
    instead of being formatted into the query text, so a value containing
    quotes cannot break or alter the query.

    Parameters:
    -----------
    country_code: str
        The country code of the country of interest (e.g. 'TWN').
    Returns:
    --------
    df: pandas dataframe
        One row per (n)-[:SUPPLIES_TO]->(m) relationship where n.country_code
        equals `country_code`, with the columns `n.name` and `m.name`.

    """
    query = """
    MATCH (n:Company)-[:SUPPLIES_TO]->(m:Company)
    WHERE n.country_code=$country_code
    RETURN n.name,m.name
    """

    # NOTE(review): assumes the second argument of graph.query_run_df is the
    # Cypher parameter map (every call in this notebook passes {}) - confirm
    # against utils.Graph.
    return graph.query_run_df(query, {'country_code': country_code})

In [12]:
# Supplier/customer name pairs for suppliers whose country code is 'TWN'.
find_comp_in_country(country_code='TWN')

Unnamed: 0,n.name,m.name
0,"Himax Technologies, Inc.","Excel Co., Ltd."
1,"Himax Technologies, Inc.",Sharp Corp.
2,"Himax Technologies, Inc.",Tecno Telecom Ltd.
3,"Himax Technologies, Inc.","Samsung Electronics Co., Ltd."
4,"Himax Technologies, Inc.","LG Innotek Co., Ltd."
...,...,...
5626,APS Advanced Printing Systems Co.,"Abaxis, Inc."
5627,Analog Integrations Corp.,"Avnet, Inc."
5628,Aurora Corp.,"Heran Co., Ltd."
5629,Aurotek Corp.,"Satori Electric Co., Ltd."


One idea is to remove all the Taiwan relationships after a particular point in time and to sample the relationships so that the training, validation and testing datasets all contain them. This way we can (hopefully!) ensure that the model does not simply learn to predict Taiwan as a suitable relation.

In [13]:
# Every SUPPLIES_TO relationship whose supplier (m) has country_code 'TWN',
# returned as supplier name, the relationship's revenue_pct and customer name.
query_all_rel = """ 
MATCH (m:Company)-[r:SUPPLIES_TO]->(n:Company)
WHERE m.country_code='TWN'
RETURN m.name, r.revenue_pct, n.name
"""

# Same query, restricted to relationships whose date falls in a year strictly
# greater than 2020. It is only defined here; this cell does not run it.
query_all_rel_after_2020 = """ 
MATCH (m:Company)-[r:SUPPLIES_TO]->(n:Company)
WHERE m.country_code='TWN' and datetime(r.date).year > 2020
RETURN m.name, r.revenue_pct, n.name
"""


# Run the unrestricted query with an empty second argument and display the result.
graph.query_run_df(query_all_rel,{})

Unnamed: 0,m.name,r.revenue_pct,n.name
0,"Himax Technologies, Inc.",-999.0,"Excel Co., Ltd."
1,"Himax Technologies, Inc.",-999.0,Sharp Corp.
2,"Himax Technologies, Inc.",-999.0,Tecno Telecom Ltd.
3,"Himax Technologies, Inc.",-999.0,"Samsung Electronics Co., Ltd."
4,"Himax Technologies, Inc.",-999.0,"LG Innotek Co., Ltd."
...,...,...,...
5626,APS Advanced Printing Systems Co.,-999.0,"Abaxis, Inc."
5627,Analog Integrations Corp.,-999.0,"Avnet, Inc."
5628,Aurora Corp.,-999.0,"Heran Co., Ltd."
5629,Aurotek Corp.,-999.0,"Satori Electric Co., Ltd."


# Drop all Graphs

In [None]:
# For every graph name yielded by gds.graph.list(), call gds.graph.drop on it
# and return the schema yielded by the drop.
query_graph_drop_all = """ 
CALL gds.graph.list() YIELD graphName
UNWIND graphName as t
CALL gds.graph.drop(t) YIELD schema
RETURN schema
"""

# Execute the drop query with an empty second argument.
graph.query_run(query_graph_drop_all,{})

# Training Set

In [16]:
# Every SUPPLIES_TO relationship between two Company nodes, returned with the
# supplier's name and country code, the relationship's revenue_pct, the year of
# the relationship's date, and the customer's name.
query_positive_examples = """ 
MATCH (m:Company)-[r:SUPPLIES_TO]->(n:Company)
RETURN m.name,m.country_code, r.revenue_pct, datetime(r.date).year, n.name
"""

# Existing links are kept as the positive examples of the training set.
positive_examples = graph.query_run_df(query_positive_examples,{})

In [19]:
# Display the positive examples loaded above.
positive_examples

Unnamed: 0,m.name,m.country_code,r.revenue_pct,datetime(r.date).year,n.name
0,"Infinity Pharmaceuticals, Inc.",USA,-999.0,2021,"Secura Bio, Inc."
1,"Infinity Pharmaceuticals, Inc.",USA,-999.0,2021,"PellePharm, Inc."
2,"Bozhon Precision Industry Technology Co., Ltd.",CHN,-999.0,2020,"Gree Electric Appliances, Inc. of Zhuhai"
3,"Bozhon Precision Industry Technology Co., Ltd.",CHN,-999.0,2020,"Hon Hai Precision Industry Co., Ltd."
4,"Bozhon Precision Industry Technology Co., Ltd.",CHN,-999.0,2020,"Quanta Computer, Inc."
...,...,...,...,...,...
233747,Aviation Lease & Finance Co. KSCC,KWT,-999.0,2021,International Consolidated Airlines Group SA
233748,"Avid Ratings, Inc.",USA,-999.0,2017,"Bazaarvoice, Inc."
233749,Awami Frutos Seco SL,ESP,-999.0,2021,National Foods Ltd. (Pakistan)
233750,Axis REIT Managers Bhd.,MYS,-999.0,2017,Axis Real Estate Investment Trust


In [21]:
# Non-null value counts of the remaining columns for each
# (relationship year, supplier country code) pair, with the two grouping keys
# turned back into regular columns.
link_counts = (
    positive_examples
    .groupby(['datetime(r.date).year', 'm.country_code'])
    .count()
    .reset_index()
)

In [22]:
# Keep only the yearly counts whose supplier country code is 'TWN'.
link_counts.loc[link_counts['m.country_code'].eq('TWN')]

Unnamed: 0,datetime(r.date).year,m.country_code,m.name,r.revenue_pct,n.name
9,2003,TWN,5,5,5
18,2004,TWN,7,7,7
29,2005,TWN,1,1,1
40,2006,TWN,7,7,7
51,2007,TWN,2,2,2
63,2008,TWN,1,1,1
101,2010,TWN,5,5,5
134,2011,TWN,21,21,21
171,2012,TWN,38,38,38
214,2013,TWN,34,34,34
