In [4]:
#paper name

#motivation (of paper)

# 1. cell ignore, then walkthrough

# am beispiel Lebenserwartung...

In [5]:
#############################################
# imports                                   #
#############################################

from typing import Callable, Tuple, List, DefaultDict, Set
import heapq
from collections import defaultdict
import pickle
import hashlib
from unicodedata import numeric
import pandas as pd
from pathlib import Path
from collections import Counter

from qcr import load_index
from qcr import save_index
from qcr import load_tables
from qcr import load_query
from qcr import get_table_id
from qcr import hash_function


## A Sketch-based Index for Correlated Dataset Search

Discovering Data with joinable keys and correlated data.

Steps of the algorithm:
1. build index of tables in DB
    - split larger tables into 2-col-tables (cross product)
    - creating a sketch of size n of each table in the Database
    - build the index
    
    
2. query
    - creating a sketch of the query table
    - find correlated and joinable tables for the query-table
(finding set overlap between the two sketches ?)
    

### building the index:

1. all tables with more then 2 columns are split:
    - all numerical columns are combined with all columns containig categorical values
2. all numeric values (k) per table are hashed and stored as tuple with their categorical value (=key) c_k
    - <h(k), c_k>
    - for performance reason the sketch size is applied here: only the smallest n hashed calues are kept
3. all categorical keys (c_k) are modified according to their hashed values (h(k))
    - if the hashed value is below or above the median of all table values, c_k is categorized in -c_k or +c_k respectively
    - <h(k), +/-c_k>
    - this is used to identify corrolation


4. a _mirror image_ is of each sketch is build
    - sign of value and key is inverted
    - this is used to identify inversed correlation

3. picking a specific sample (=sketch) of size n per table, using n tuples with smallest hash-value
    - this way the samples are comparable


In [None]:
## sample data

data:
t1
# land # raucherquote #

t2
# land # alkoholkonsum #

t3
# land # km_2 fläche #

In [6]:
# 0. load table and initialize index

inverted_index = load_index()
tables = load_tables()

## sample run:

table = tables [0] # table = c1

print(table)

A_0   id     value
0    BCI -0.128111
1    JUX -0.533593
2    DPV -0.147960
3    TWW  0.925996
4    FIU  0.385326
5    HJQ -0.011788
6    MNT -0.413050
7    CJS -0.375886
8    KNO -0.047905
9    ACI -0.513682
10   BMN  0.614774
11   DDE -0.224338
12   KMX  0.093107
13   GIL  0.293723
14   IVV -0.425436
15   EHY -0.336279
16   HLQ -0.033187
17   DHL -0.373409
18   EPZ -0.045107
19   DOS  0.040730
20   BBZ  0.107688
21   HHO -0.576108
22   DWW -0.199871
23   FOO  0.891773
24   AMX -0.058327
25   IPV -0.557524
26   AQW -0.650432
27   HZZ  0.234087
28   KKU  0.379032
29   ATZ  0.679210
30   BFG -0.206311
31   IMX  0.063299
32   IPQ  0.346967
33   XXZ -0.277239


In [7]:
# 1. extracting numerical column (feature) and categorical column (key)

def get_kc(table: pd.DataFrame) -> List[str]:
    KC_column_name = table.select_dtypes(include=["object"]).columns[0]
    return table[KC_column_name].values.tolist()


def get_c(table: pd.DataFrame) -> List[numeric]:
    C_column_name = table.select_dtypes(include=["float64", "int64"]).columns[0]
    return table[C_column_name].values.tolist()


KC = get_kc(table)
C = get_c(table)

print("KC:")
print(KC)
print("C:")
print(C)

KC:
['BCI', 'JUX', 'DPV', 'TWW', 'FIU', 'HJQ', 'MNT', 'CJS', 'KNO', 'ACI', 'BMN', 'DDE', 'KMX', 'GIL', 'IVV', 'EHY', 'HLQ', 'DHL', 'EPZ', 'DOS', 'BBZ', 'HHO', 'DWW', 'FOO', 'AMX', 'IPV', 'AQW', 'HZZ', 'KKU', 'ATZ', 'BFG', 'IMX', 'IPQ', 'XXZ']
C:
[-0.1281105543723016, -0.5335927281069821, -0.1479597416245595, 0.9259963149332002, 0.3853261298618304, -0.0117876087427923, -0.4130504301642083, -0.375886187354391, -0.0479053088996654, -0.5136823870979241, 0.6147741925572212, -0.2243381689929853, 0.0931069692600516, 0.2937234790383699, -0.425435991029031, -0.3362794162285384, -0.033187001449767, -0.3734093520687084, -0.0451071161146956, 0.040729933495606, 0.1076883846738036, -0.5761081689656187, -0.1998714257897736, 0.8917731795254693, -0.0583273572732885, -0.5575243565276304, -0.6504318687617083, 0.2340872510127786, 0.3790319895899433, 0.6792104811146353, -0.2063110598812824, 0.0632990467993411, 0.3469673544678994, -0.2772392610871207]


In [None]:
# create hash functions h and hu

def create_hash_functions() -> Tuple[Callable[[str], int], Callable[[int], int]]:

    return (
        lambda x: int.from_bytes(
            hashlib.md5(x.encode("utf-8")).digest(), "little", signed=True
        ),
        lambda x: int.from_bytes(
            hashlib.md5(str(x).encode("utf-8")).digest(), "little", signed=True
        ) / 2 ** 256,
    )


h, hu = create_hash_functions()  
# both hash functions are used to create the sketch (hu(h(k)))

In [None]:
# 2. hash numerical values

def create_sketch(
    KC: List[str],                  C: List[numeric],
    h: Callable[[str], int],        hu: Callable[[int], int],       n=100,
    ) -> List[Tuple[str, numeric]]:
    
    sketch = heapq.nsmallest(n, zip(KC, C), key=lambda x: hu(h(x[0])))
    ##       heapq.nsmallest(n, iterable  , key=funktion)
    ##       n = return n smalllest results
    ##       iterable = perform function on this data
    ##       key = use this function, pick n smallest results and put on sorted heap (heapque)
    
    return sketch

sketch = create_sketch(KC, C, h, hu)

print(sketch)

In [None]:
# replace this with step by step sample î

In [None]:
# 3. categorize keys by value and use as new key

def generate_term_keys(sketch: List[Tuple[str, numeric]], h: Callable[[str], int]) -> List[int]:

    # mue = median of all values of this tables numeric column
    mue = sum([value for key, value in sketch]) / len(sketch)
    
    # categorize key by > median (+key) or < median (-key)
    # hash categorized term
    categorized_key = [h(f'{h(key)}{"+1" if value > mue else "-1"}') for key, value in sketch]
    
    return categorized_key


hashed_terms = generate_term_keys(sketch, h)

print(hashed_terms)

In [None]:
## toy example to vizualize steps

# TODO

In [None]:
# 4. store hashed and categorized terms  (not sketch?)  in inverted index ???

def add_to_inverted_index( inverted_index: DefaultDict[int, Set[str]],
                           hashed_terms: List[int], 
                           table_id: str                                  ) -> None:

    for term in hashed_terms:
        inverted_index[term].add(table_id)

table_id = get_table_id(table)
add_to_inverted_index(inverted_index, hashed_terms, table_id)

print(inverted_index)
# 130988216787560463481282035084846401787: {'A_0'}
# hashed categorized key/term

In [None]:
### the above is an example of one table, the full code can be foind in qcr.py

# now we also import the other 2 tables into the index
import qcr
c1 = pd.DataFrame([[],[]])
c2 = pd.DataFrame([[],[]])
qcr.build_index([c1, c2])

In [None]:
# query table: (key & taget)
q = pd.DataFrame([[],[]])
# land # lebenserwartung #
# ...


# as above:
# 1. build sketch of query table
sketch = qcr.create_sketch(q[0], q[1], hash_function(), n=5)
# 2. generate terms
terms = qcr.generate_term_keys(sketch)

# 3.  inverse values of sketch
# for negative correlation
inverse_terms = generate_term_keys(
    list(map((lambda key_value: (key_value[0], -key_value[1])), sketch)), h
    ) # same function as above, input is inverted




In [None]:
# execute query
# 1. load idex
inverted_index = load_index()

# count how many tables match the sketches terms
result = Counter()
result.update(
    "+:" + table_id for term in terms for table_id in inverted_index[term]
)
result.update(
    "-:" + table_id for term in inverse_terms for table_id in inverted_index[term]
)

sketch = result.most_common(10)

In [None]:
### Query dataset (Q)

df_Q = pd.DataFrame({'movies':['A','B','C','D','E'], 
                     'budget in mil €':[100, 200, 500, 300, 300],
                     'stars':[2,3,4,4.8,3]})
display(df_Q)

print()
print('hashed df:')
h_k = df_Q['budget in mil €'].apply(lambda k: hash(k))
df_Q['budget in mil €'] = h_k

h_k = df_Q['stars'].apply(lambda k: hash(k))
df_Q['stars'] = h_k

display(df_Q)

In [None]:
### example of a corrolated and joinable dataset from the corpus (c):

df_c = pd.DataFrame({'movies':['C','D','E','F','G',], 
                     'budget per staff':[1.2, 3.5, 8, 10, 4]})
display(df_c)

h_k = df_c['budget per staff'].apply(lambda k: hash(k))
df_c['budget per staff'] = h_k
display(df_c)
#df = pd.DataFrame({'movies':['C','D','E','F','G',], 'sick days':[500, 50, 150, 175, 100]})
#df

In [None]:
# choose sketch for each table

### correlation:



In [None]:
# find corrolated sketches and joinable sketches


### joinability:

In [None]:
#############################################
# building the index                        #
#                        overview           #
#############################################
def build_index() -> None:
    # 0. load table and initialize index
    inverted_index = defaultdict(set)
    tables = load_tables()

    # create sketch for every table and add it to the intex
    for table in tables:

        # 1. build 2-column-tables
        KC = get_kc(table)
        C = get_c(table)

        # create hash functions
        h, hu = create_hash_functions(KC, C)

        # 2. hash numerical values
        sketch = create_sketch(KC, C, h, hu)

        # 3. categorize keys by value
        terms = generate_term_keys(sketch, h)


        # 4. store terms // sketch in inverted index ???
        table_id = get_table_id(table)
        add_to_inverted_index(inverted_index, terms, table_id)


    save_index(inverted_index)

In [None]:
def find_tables(query: pd.DataFrame) -> List[str]:
    # 1. build 2-column-tables
    KC = get_kc(query)
    C = get_c(query)
    
    # create hash functions
    h, hu = create_hash_functions()  # both hash functions are used to create the sketch (hu(h(k)))
    
    # hash numerical values
    sketch = create_sketch(KC, C, h, hu)
    
    # categorize keys by value
    terms = generate_term_keys(sketch, h)
    
    # mirror image
    anti_terms = tk(
        list(map((lambda key_value: (key_value[0], -key_value[1])), sketch)), h
        )
    
    # pick smallest n terms for specific sample
    inverted_index = load_index()
    result = Counter()
    result.update(
        "+:" + table_id for term in terms for table_id in inverted_index[term]
    )
    result.update(
        "-:" + table_id for term in anti_terms for table_id in inverted_index[term]
    )
    
    sketch = result.most_common(10)
    return sketch

In [None]:
# mirror image of tuples (same function as above, mirrored input)

mirror_image = list(map((lambda key_value: (key_value[0], -key_value[1])), sketch)), h

anti_terms = generate_term_keys( mirror_image )

In [None]:
# pick smallest n terms for specific sample
    inverted_index = load_index()
    result = Counter()
    result.update(
        "+:" + table_id for term in terms for table_id in inverted_index[term]
    )
    result.update(
        "-:" + table_id for term in anti_terms for table_id in inverted_index[term]
    )
    
    sketch = result.most_common(10)
    return sketch

130988216787560463481282035084846401787
-164498018166270460019726497310340920573
5949854918686469283656396133996932657
-159667670477757348212119561389798475760
-123518883473990951742242060116156631840
-153874510321718400393200698335856524090
70889870575502088878342233457187125602
-103642463282839860330255785527869227373
-107652028331406107376014582386653940604
169670371384228868865829084865906723728
13430687092371639548959914221081430938
-163405778099304275901291005258578865528
38547144166318634051688885511327055193
50713401525494693548196096710288557761
-12666881906914606031712355037526090798
66624635521653478560655675784325324453
-158889586620291573373844578574685643319
167831168118952399679844763627985461912
-45922898634068634104107350093192440227
-35516288707044414466829460195387135299
-126702777333477576826350699968714715348
92614370021379324331431456951883824963
131403182100870069080875901334271267982
-104234191481797814235810275821827468786
11199091245062852640244356660245418822

In [17]:
# 2. hash numerical values

def create_sketch(
    KC: List[str],                  C: List[numeric],
    h: Callable[[str], int],        hu: Callable[[int], int],       n=100,
    ) -> List[Tuple[str, numeric]]:
    
    sketch = heapq.nsmallest(n, zip(KC, C), key=lambda x: hu(h(x[0])))
    ##       heapq.nsmallest(n, iterable  , key=funktion)
    ##       n = return n smalllest results
    ##       iterable = perform function on this data
    ##       key = use this function, pick n smallest results and put on sorted heap (heapque)
    
    return sketch

sketch = create_sketch(KC, C, h, hu)

print(sketch)

[('HZZ', 0.2340872510127786), ('XXZ', -0.2772392610871207), ('HLQ', -0.033187001449767), ('AMX', -0.0583273572732885), ('IPQ', 0.3469673544678994), ('HHO', -0.5761081689656187), ('DOS', 0.040729933495606), ('BCI', -0.1281105543723016), ('BBZ', 0.1076883846738036), ('CJS', -0.375886187354391), ('DWW', -0.1998714257897736), ('TWW', 0.9259963149332002), ('FIU', 0.3853261298618304), ('IMX', 0.0632990467993411), ('AQW', -0.6504318687617083), ('BFG', -0.2063110598812824), ('KMX', 0.0931069692600516), ('ACI', -0.5136823870979241), ('EPZ', -0.0451071161146956), ('IVV', -0.425435991029031), ('FOO', 0.8917731795254693), ('HJQ', -0.0117876087427923), ('EHY', -0.3362794162285384), ('DHL', -0.3734093520687084), ('IPV', -0.5575243565276304), ('DPV', -0.1479597416245595), ('KKU', 0.3790319895899433), ('DDE', -0.2243381689929853), ('BMN', 0.6147741925572212), ('GIL', 0.2937234790383699), ('MNT', -0.4130504301642083), ('JUX', -0.5335927281069821), ('KNO', -0.0479053088996654), ('ATZ', 0.679210481114635

In [18]:
# 3. categorize keys by value and use as new key

def generate_term_keys(sketch: List[Tuple[str, numeric]], h: Callable[[str], int]) -> List[int]:

    # mue = median of all values of this tables numeric column
    mue = sum([value for key, value in sketch]) / len(sketch)
    
    # categorize key by > median (+key) or < median (-key)
    # hash categorized term
    categorized_key = [h(f'{h(key)}{"+1" if value > mue else "-1"}') for key, value in sketch]
    
    return categorized_key


hashed_terms = generate_term_keys(sketch, h)

print(hashed_terms)

[130988216787560463481282035084846401787, -164498018166270460019726497310340920573, 5949854918686469283656396133996932657, -159667670477757348212119561389798475760, -123518883473990951742242060116156631840, -153874510321718400393200698335856524090, 70889870575502088878342233457187125602, -103642463282839860330255785527869227373, -107652028331406107376014582386653940604, 169670371384228868865829084865906723728, 13430687092371639548959914221081430938, -163405778099304275901291005258578865528, 38547144166318634051688885511327055193, 50713401525494693548196096710288557761, -12666881906914606031712355037526090798, 66624635521653478560655675784325324453, -158889586620291573373844578574685643319, 167831168118952399679844763627985461912, -45922898634068634104107350093192440227, -35516288707044414466829460195387135299, -126702777333477576826350699968714715348, 92614370021379324331431456951883824963, 131403182100870069080875901334271267982, -104234191481797814235810275821827468786, 1119909124506

In [None]:
## toy example to vizualize steps

# TODO

In [20]:
# 4. store hashed and categorized terms  (not sketch?)  in inverted index ???

def add_to_inverted_index( inverted_index: DefaultDict[int, Set[str]],
                           hashed_terms: List[int], 
                           table_id: str                                  ) -> None:

    for term in hashed_terms:
        inverted_index[term].add(table_id)

table_id = get_table_id(table)
add_to_inverted_index(inverted_index, hashed_terms, table_id)

print(inverted_index)
# 130988216787560463481282035084846401787: {'A_0'}
# hashed categorized key/term

defaultdict(<class 'set'>, {130988216787560463481282035084846401787: {'A_0'}, -164498018166270460019726497310340920573: {'A_0'}, 5949854918686469283656396133996932657: {'A_0'}, -159667670477757348212119561389798475760: {'A_0'}, -123518883473990951742242060116156631840: {'A_0'}, -153874510321718400393200698335856524090: {'A_0'}, 70889870575502088878342233457187125602: {'A_0'}, -103642463282839860330255785527869227373: {'A_0'}, -107652028331406107376014582386653940604: {'A_0'}, 169670371384228868865829084865906723728: {'A_0'}, 13430687092371639548959914221081430938: {'A_0'}, -163405778099304275901291005258578865528: {'A_0'}, 38547144166318634051688885511327055193: {'A_0'}, 50713401525494693548196096710288557761: {'A_0'}, -12666881906914606031712355037526090798: {'A_0'}, 66624635521653478560655675784325324453: {'A_0'}, -158889586620291573373844578574685643319: {'A_0'}, 167831168118952399679844763627985461912: {'A_0'}, -45922898634068634104107350093192440227: {'A_0'}, -35516288707044414466

In [34]:
### Query dataset (Q)

df_Q = pd.DataFrame({'movies':['A','B','C','D','E'], 
                     'budget in mil €':[100, 200, 500, 300, 300],
                     'stars':[2,3,4,4.8,3]})
display(df_Q)

print()
print('hashed df:')
h_k = df_Q['budget in mil €'].apply(lambda k: hash(k))
df_Q['budget in mil €'] = h_k

h_k = df_Q['stars'].apply(lambda k: hash(k))
df_Q['stars'] = h_k

display(df_Q)

Unnamed: 0,movies,budget in mil €,stars
0,A,100,2.0
1,B,200,3.0
2,C,500,4.0
3,D,300,4.8
4,E,300,3.0



hashed df:


Unnamed: 0,movies,budget in mil €,stars
0,A,0.1,0.002
1,B,0.2,0.003
2,C,0.5,0.004
3,D,0.3,0.0048
4,E,0.3,0.003


In [36]:
### example of a corrolated and joinable dataset from the corpus (c):

df_c = pd.DataFrame({'movies':['C','D','E','F','G',], 
                     'budget per staff':[1.2, 3.5, 8, 10, 4]})
display(df_c)

h_k = df_c['budget per staff'].apply(lambda k: hash(k))
df_c['budget per staff'] = h_k
display(df_c)
#df = pd.DataFrame({'movies':['C','D','E','F','G',], 'sick days':[500, 50, 150, 175, 100]})
#df

Unnamed: 0,movies,budget per staff
0,C,1.2
1,D,3.5
2,E,8.0
3,F,10.0
4,G,4.0


Unnamed: 0,movies,budget per staff
0,C,0.0012
1,D,0.0035
2,E,0.008
3,F,0.01
4,G,0.004


In [None]:
# choose sketch for each table

### correlation:



In [None]:
# find corrolated sketches and joinable sketches


### joinability:

In [None]:
def find_tables(query: pd.DataFrame) -> List[str]:
    # 1. build 2-column-tables
    KC = get_kc(query)
    C = get_c(query)
    
    # create hash functions
    h, hu = create_hash_functions()  # both hash functions are used to create the sketch (hu(h(k)))
    
    # hash numerical values
    sketch = create_sketch(KC, C, h, hu)
    
    # categorize keys by value
    terms = generate_term_keys(sketch, h)
    
    # mirror image
    anti_terms = tk(
        list(map((lambda key_value: (key_value[0], -key_value[1])), sketch)), h
        )
    
    # pick smallest n terms for specific sample
    inverted_index = load_index()
    result = Counter()
    result.update(
        "+:" + table_id for term in terms for table_id in inverted_index[term]
    )
    result.update(
        "-:" + table_id for term in anti_terms for table_id in inverted_index[term]
    )
    
    sketch = result.most_common(10)
    return sketch

In [None]:
# mirror image of tuples (same function as above, mirrored input)

mirror_image = list(map((lambda key_value: (key_value[0], -key_value[1])), sketch)), h

anti_terms = generate_term_keys( mirror_image )

In [None]:
# pick smallest n terms for specific sample
    inverted_index = load_index()
    result = Counter()
    result.update(
        "+:" + table_id for term in terms for table_id in inverted_index[term]
    )
    result.update(
        "-:" + table_id for term in anti_terms for table_id in inverted_index[term]
    )
    
    sketch = result.most_common(10)
    return sketch

In [91]:
import pandas as pd
df = pd.read_csv('data/Life_Expectancy_Data.csv', sep=';')
df['Country'] = df['Country'].str.strip()

In [92]:
df = df.groupby(['Country']).mean().reset_index(level=0)
df_small = df[['Country','Life expectancy ', 'Alcohol', ' BMI ']]
df_small

Unnamed: 0,Country,Life expectancy,Alcohol,BMI
0,Afghanistan,58.19375,0.014375,15.51875
1,Albania,75.15625,4.848750,49.06875
2,Algeria,73.61875,0.406667,48.74375
3,Angola,49.01875,5.740667,18.01875
4,Antigua and Barbuda,75.05625,7.949333,38.42500
...,...,...,...,...
188,Venezuela (Bolivarian Republic of),73.38750,7.420000,54.48750
189,Viet Nam,74.77500,3.087333,11.18750
190,Yemen,63.86250,0.047333,33.48750
191,Zambia,53.90625,2.239333,17.45000


In [93]:
df_smaller = df_small.loc[df_small['Country'].isin(['Nigeria', 'Equatorial Guinea', 'Kazakhstan', 'Paraguay', 'Colombia', 'Armenia', 'Nicaragua', 'Montenegro', 'Maledives', 'Quatar', 'Germany', 'United States of America'])]
q = df_smaller[['Country', 'Life expectancy ']]
c1 = df_smaller[['Country', 'Alcohol']]
c2 = df_smaller[['Country', ' BMI ']]
df_smaller


Unnamed: 0,Country,Life expectancy,Alcohol,BMI
6,Armenia,73.4,3.702667,44.70625
35,Colombia,73.2875,4.419333,49.54375
54,Equatorial Guinea,55.3125,7.342,17.85625
64,Germany,81.175,11.628667,51.99375
86,Kazakhstan,66.7625,6.641333,45.15625
112,Montenegro,74.5,2.584286,50.4875
121,Nicaragua,73.45,3.596667,42.68125
123,Nigeria,51.35625,8.646667,19.75
131,Paraguay,73.1125,5.527333,39.525
184,United States of America,78.0625,8.579333,58.45


In [100]:
df2 = pd.read_csv('data/countries_of_the_world.csv')
display(df2)
#display(df2['Country'])
#display(df_small['Country'])
#dfx = df2[['Country']].join(df_small[['Country']], lsuffix='_who', rsuffix='_kaggle')
#dfx
#df_merge = pd.merge(df_small, df2, on='Country',  how='left')
#df_merge
display(df2.dtypes)
display(df_small.dtypes)
df_join = df_small.join(df2, on='Country',  how='left', lsuffix='_who', rsuffix='_kaggle')
df_join

Unnamed: 0,Country,Region,Population,Area (sq. mi.),Pop. Density (per sq. mi.),Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),GDP ($ per capita),Literacy (%),Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
0,Afghanistan,ASIA (EX. NEAR EAST),31056997,647500,480,000,2306,16307,700.0,360,32,1213,022,8765,1,466,2034,038,024,038
1,Albania,EASTERN EUROPE,3581655,28748,1246,126,-493,2152,4500.0,865,712,2109,442,7449,3,1511,522,0232,0188,0579
2,Algeria,NORTHERN AFRICA,32930091,2381740,138,004,-039,31,6000.0,700,781,322,025,9653,1,1714,461,0101,06,0298
3,American Samoa,OCEANIA,57794,199,2904,5829,-2071,927,8000.0,970,2595,10,15,75,2,2246,327,,,
4,Andorra,WESTERN EUROPE,71201,468,1521,000,66,405,19000.0,1000,4972,222,0,9778,3,871,625,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222,West Bank,NEAR EAST,2460492,5860,4199,000,298,1962,800.0,,1452,169,1897,6413,3,3167,392,009,028,063
223,Western Sahara,NORTHERN AFRICA,273008,266000,10,042,,,,,,002,0,9998,1,,,,,04
224,Yemen,NEAR EAST,21456188,527970,406,036,0,615,800.0,502,372,278,024,9698,1,4289,83,0135,0472,0393
225,Zambia,SUB-SAHARAN AFRICA,11502010,752614,153,000,0,8829,800.0,806,82,708,003,929,2,41,1993,022,029,0489


Country                                object
Region                                 object
Population                              int64
Area (sq. mi.)                          int64
Pop. Density (per sq. mi.)             object
Coastline (coast/area ratio)           object
Net migration                          object
Infant mortality (per 1000 births)     object
GDP ($ per capita)                    float64
Literacy (%)                           object
Phones (per 1000)                      object
Arable (%)                             object
Crops (%)                              object
Other (%)                              object
Climate                                object
Birthrate                              object
Deathrate                              object
Agriculture                            object
Industry                               object
Service                                object
dtype: object

Country              object
Life expectancy     float64
Alcohol             float64
 BMI                float64
dtype: object

ValueError: You are trying to merge on object and int64 columns. If you wish to proceed you should use pd.concat

In [127]:
import pandas as pd
df = pd.read_csv('data/Life_Expectancy_Data.csv', sep=';')
df['Country'] = df['Country'].str.strip()
df = df.groupby(['Country']).mean().reset_index(level=0)
df_small = df[['Country','Life expectancy ', 'Alcohol', ' BMI ']]
df_small


df2 = pd.read_csv('data/countries_of_the_world.csv')
display(df2)
#display(df2['Country'])
#display(df_small['Country'])
#dfx = df2[['Country']].join(df_small[['Country']], lsuffix='_who', rsuffix='_kaggle')
#dfx
#df_merge = pd.merge(df_small, df2, on='Country',  how='left')
#df_merge

df2 = df2.astype({'Country': 'string'})
df_small = df_small.astype({'Country': 'string'})

df2 = df2.set_index('Country')
display(df2)
df_small = df_small.set_index('Country')
display(df_small)

#df_obj = df2.select_dtypes(['object'])
#df2[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())

#df_obj = df_small.select_dtypes(['object'])
#df_small[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())



display(df2.dtypes)
display(df_small.dtypes)
#display(df2['Country'])
#display(df_small['Country'])
df_join = df_small.join(df2, lsuffix='_who', rsuffix='_kaggle')
df_join

Unnamed: 0,Country,Region,Population,Area (sq. mi.),Pop. Density (per sq. mi.),Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),GDP ($ per capita),Literacy (%),Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
0,Afghanistan,ASIA (EX. NEAR EAST),31056997,647500,480,000,2306,16307,700.0,360,32,1213,022,8765,1,466,2034,038,024,038
1,Albania,EASTERN EUROPE,3581655,28748,1246,126,-493,2152,4500.0,865,712,2109,442,7449,3,1511,522,0232,0188,0579
2,Algeria,NORTHERN AFRICA,32930091,2381740,138,004,-039,31,6000.0,700,781,322,025,9653,1,1714,461,0101,06,0298
3,American Samoa,OCEANIA,57794,199,2904,5829,-2071,927,8000.0,970,2595,10,15,75,2,2246,327,,,
4,Andorra,WESTERN EUROPE,71201,468,1521,000,66,405,19000.0,1000,4972,222,0,9778,3,871,625,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222,West Bank,NEAR EAST,2460492,5860,4199,000,298,1962,800.0,,1452,169,1897,6413,3,3167,392,009,028,063
223,Western Sahara,NORTHERN AFRICA,273008,266000,10,042,,,,,,002,0,9998,1,,,,,04
224,Yemen,NEAR EAST,21456188,527970,406,036,0,615,800.0,502,372,278,024,9698,1,4289,83,0135,0472,0393
225,Zambia,SUB-SAHARAN AFRICA,11502010,752614,153,000,0,8829,800.0,806,82,708,003,929,2,41,1993,022,029,0489


Unnamed: 0_level_0,Region,Population,Area (sq. mi.),Pop. Density (per sq. mi.),Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),GDP ($ per capita),Literacy (%),Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Afghanistan,ASIA (EX. NEAR EAST),31056997,647500,480,000,2306,16307,700.0,360,32,1213,022,8765,1,466,2034,038,024,038
Albania,EASTERN EUROPE,3581655,28748,1246,126,-493,2152,4500.0,865,712,2109,442,7449,3,1511,522,0232,0188,0579
Algeria,NORTHERN AFRICA,32930091,2381740,138,004,-039,31,6000.0,700,781,322,025,9653,1,1714,461,0101,06,0298
American Samoa,OCEANIA,57794,199,2904,5829,-2071,927,8000.0,970,2595,10,15,75,2,2246,327,,,
Andorra,WESTERN EUROPE,71201,468,1521,000,66,405,19000.0,1000,4972,222,0,9778,3,871,625,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
West Bank,NEAR EAST,2460492,5860,4199,000,298,1962,800.0,,1452,169,1897,6413,3,3167,392,009,028,063
Western Sahara,NORTHERN AFRICA,273008,266000,10,042,,,,,,002,0,9998,1,,,,,04
Yemen,NEAR EAST,21456188,527970,406,036,0,615,800.0,502,372,278,024,9698,1,4289,83,0135,0472,0393
Zambia,SUB-SAHARAN AFRICA,11502010,752614,153,000,0,8829,800.0,806,82,708,003,929,2,41,1993,022,029,0489


Unnamed: 0_level_0,Life expectancy,Alcohol,BMI
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,58.19375,0.014375,15.51875
Albania,75.15625,4.848750,49.06875
Algeria,73.61875,0.406667,48.74375
Angola,49.01875,5.740667,18.01875
Antigua and Barbuda,75.05625,7.949333,38.42500
...,...,...,...
Venezuela (Bolivarian Republic of),73.38750,7.420000,54.48750
Viet Nam,74.77500,3.087333,11.18750
Yemen,63.86250,0.047333,33.48750
Zambia,53.90625,2.239333,17.45000


Region                                 object
Population                              int64
Area (sq. mi.)                          int64
Pop. Density (per sq. mi.)             object
Coastline (coast/area ratio)           object
Net migration                          object
Infant mortality (per 1000 births)     object
GDP ($ per capita)                    float64
Literacy (%)                           object
Phones (per 1000)                      object
Arable (%)                             object
Crops (%)                              object
Other (%)                              object
Climate                                object
Birthrate                              object
Deathrate                              object
Agriculture                            object
Industry                               object
Service                                object
dtype: object

Life expectancy     float64
Alcohol             float64
 BMI                float64
dtype: object

Unnamed: 0_level_0,Life expectancy,Alcohol,BMI,Region,Population,Area (sq. mi.),Pop. Density (per sq. mi.),Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),...,Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,58.19375,0.014375,15.51875,,,,,,,,...,,,,,,,,,,
Albania,75.15625,4.848750,49.06875,,,,,,,,...,,,,,,,,,,
Algeria,73.61875,0.406667,48.74375,,,,,,,,...,,,,,,,,,,
Angola,49.01875,5.740667,18.01875,,,,,,,,...,,,,,,,,,,
Antigua and Barbuda,75.05625,7.949333,38.42500,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Venezuela (Bolivarian Republic of),73.38750,7.420000,54.48750,,,,,,,,...,,,,,,,,,,
Viet Nam,74.77500,3.087333,11.18750,,,,,,,,...,,,,,,,,,,
Yemen,63.86250,0.047333,33.48750,,,,,,,,...,,,,,,,,,,
Zambia,53.90625,2.239333,17.45000,,,,,,,,...,,,,,,,,,,
