# Data preprocessing 資料前處理

## Cleaning numerical literals 將資料夾中numerical literals清理

In [2]:
import pandas as pd
import numpy as np

In [None]:
path = "LitWD48K/"


# From LiterallyWikidata read data and check data

In [7]:
column_name = ["e","a","v"]
df48_ori = pd.read_csv(f"/projekte/tcl/tclext/kgc_chu/LiterallyWikidata/LitWD48K/numeric_literals.txt",sep="\t",names=column_name)

In [8]:
df48_ori[:5]

Unnamed: 0,e,a,v
0,Q1000056,P1082_Q199,+11032^^<http://www.w3.org/2001/XMLSchema#deci...
1,Q1000056,P2044_Q11573,+472^^<http://www.w3.org/2001/XMLSchema#decimal>
2,Q1000056,P2046_Q25343,+45630000.00^^<http://www.w3.org/2001/XMLSchem...
3,Q1000138,P1082_Q199,+1375^^<http://www.w3.org/2001/XMLSchema#decimal>
4,Q1000138,P2044_Q11573,+1^^<http://www.w3.org/2001/XMLSchema#decimal>


In [9]:
df48 = df48_ori

In [10]:
#取V值
df48[['v','xml']] = df48['v'].str.split('^', 1, expand=True)

In [11]:
# check the number of data
print(f'triple number of df48: {len(df48)}, num of df48_entity: {len(df48["e"].unique())}, num of df48_attribute: {len(df48["a"].unique())}')

triple number of df48: 324418, num of df48_entity: 47998, num of df48_attribute: 291


In [12]:
df48.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 324418 entries, 0 to 324417
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   e       324418 non-null  object
 1   a       324418 non-null  object
 2   v       324418 non-null  object
 3   xml     324418 non-null  object
dtypes: object(4)
memory usage: 9.9+ MB


# Discovering some issues within data

## 3 value types

In [13]:
#有三種數值型態
df48['xml'].value_counts()

^<http://www.w3.org/2001/XMLSchema#decimal>     148707
^<http://www.w3.org/2001/XMLSchema#double>      118078
^<http://www.w3.org/2001/XMLSchema#dateTime>     57633
Name: xml, dtype: int64

## Time (year-month-date)

In [14]:
# value type: time
df48[df48['xml']=='^<http://www.w3.org/2001/XMLSchema#dateTime>'][:5]

Unnamed: 0,e,a,v,xml
148707,Q1000051,P569,1884-11-05T00:00:00Z,^<http://www.w3.org/2001/XMLSchema#dateTime>
148708,Q1000051,P570,1962-12-01T00:00:00Z,^<http://www.w3.org/2001/XMLSchema#dateTime>
148709,Q1000056,P1249,1233-01-01T00:00:00Z,^<http://www.w3.org/2001/XMLSchema#dateTime>
148710,Q1000138,P576,2016-12-31T00:00:00Z,^<http://www.w3.org/2001/XMLSchema#dateTime>
148711,Q1000217,P571,1893-01-01T00:00:00Z,^<http://www.w3.org/2001/XMLSchema#dateTime>


## Double: coordinate location

In [9]:
# value type: double (geographic)
# coordinate location in original data is mixed. P625，P1335...(coordinate location)有經緯度
df48[df48["a"].str.contains("P1335")][:5]

Unnamed: 0,e,a,v,xml
206346,Q100015,P1335_Latitude,45.5439988,^<http://www.w3.org/2001/XMLSchema#double>
206351,Q100016,P1335_Latitude,45.8495584,^<http://www.w3.org/2001/XMLSchema#double>
206356,Q100018,P1335_Latitude,45.6671782,^<http://www.w3.org/2001/XMLSchema#double>
206362,Q100036,P1335_Latitude,44.7203568,^<http://www.w3.org/2001/XMLSchema#double>
206367,Q100038,P1335_Latitude,44.7499868,^<http://www.w3.org/2001/XMLSchema#double>


## Decimal

In [10]:
# value type: decimal
# same entity has multiple values 可能同entity在P1087有多個值
df48[df48["a"].str.contains("P1087_Q199")][:5]

Unnamed: 0,e,a,v,xml
1981,Q102664,P1087_Q199,2705,^<http://www.w3.org/2001/XMLSchema#decimal>
3219,Q104148,P1087_Q199,2480,^<http://www.w3.org/2001/XMLSchema#decimal>
3220,Q104148,P1087_Q199,2485,^<http://www.w3.org/2001/XMLSchema#decimal>
3221,Q104148,P1087_Q199,2486,^<http://www.w3.org/2001/XMLSchema#decimal>
3222,Q104148,P1087_Q199,2491,^<http://www.w3.org/2001/XMLSchema#decimal>


In [11]:
# P2299 has 2 units 有兩種QXXX
df48[df48["a"].str.contains("P2299")].a.unique()

array(['P2299_Q550207', 'P2299_Q4917'], dtype=object)

In [12]:
# some are meanful zero value but some are not
df_vzero=df48[df48["v"]=='0'].sort_values(by='a')

In [13]:
df_vzero[:5]

Unnamed: 0,e,a,v,xml
103937,Q574,P1279_Q199,0,^<http://www.w3.org/2001/XMLSchema#decimal>
58871,Q334,P1279_Q199,0,^<http://www.w3.org/2001/XMLSchema#decimal>
41606,Q229,P1279_Q199,0,^<http://www.w3.org/2001/XMLSchema#decimal>
305557,Q51,P1333_Longtiude,0,^<http://www.w3.org/2001/XMLSchema#double>
128034,Q778,P2219_Q199,0,^<http://www.w3.org/2001/XMLSchema#decimal>


In [14]:
print(f"there are total {len(df_vzero)} data which has 0.0 in the value column")
print(f"there are total {len(df_vzero['a'].unique())} data which has 0.0 in the value column")

there are total 163 data which has 0.0 in the value column
there are total 8 data which has 0.0 in the value column


In [18]:
df48[df48["a"].str.contains("P1082")].v

0            +11032
3             +1375
6             +4109
9             +6009
12             +119
            ...    
148685         +706
148688         +712
148691        +8551
148694        +7868
148697    +39144818
Name: v, Length: 22299, dtype: object

# Dealing with issues 處理以上問題: 
DataTime dtype: keep year data 時間單位 西元年 \\
Remove dulplicated data (same e,a but diff v)
Turn into value dtype to float 
所有變數的值成為可計算的值(有意義可讀取的值) \\
Rescale the values within the same attributes 同attribute value rescale \\
Drop out unreasonal 0 value 去0 \\
Drop out the unit (QXXX)去單位 \\
Keep Attribyes at least number of 15 data 每個屬性最少有15筆資料 \\


## value type: time

In [15]:
# 如果"v"是時間，取前四位數年代部分，np.where(condition, x,y)在condition下，把x取代y
df48["new_v"] = df48["v"].map(lambda x: x[:4])
df48.loc[:,"v"] = np.where((df48["xml"].str.contains("dateTime")), df48["new_v"], df48["v"])

In [16]:
df48["v"]=df48["v"].astype('float32')

## value type: double

In [182]:
# remove double data
nogeo_df48 = df48[~df48['xml'].str.contains('double')]

In [183]:
nogeo_df48

Unnamed: 0,e,a,v,xml,new_v
0,Q1000056,P1082_Q199,11032.0,^<http://www.w3.org/2001/XMLSchema#decimal>,+110
1,Q1000056,P2044_Q11573,472.0,^<http://www.w3.org/2001/XMLSchema#decimal>,+472
2,Q1000056,P2046_Q25343,45630000.0,^<http://www.w3.org/2001/XMLSchema#decimal>,+456
3,Q1000138,P1082_Q199,1375.0,^<http://www.w3.org/2001/XMLSchema#decimal>,+137
4,Q1000138,P2044_Q11573,1.0,^<http://www.w3.org/2001/XMLSchema#decimal>,+1
...,...,...,...,...,...
206335,Q99937,P571,1798.0,^<http://www.w3.org/2001/XMLSchema#dateTime>,1798
206336,Q999608,P2031,1989.0,^<http://www.w3.org/2001/XMLSchema#dateTime>,1989
206337,Q999608,P2032,2009.0,^<http://www.w3.org/2001/XMLSchema#dateTime>,2009
206338,Q999608,P569,1971.0,^<http://www.w3.org/2001/XMLSchema#dateTime>,1971


## value type: decimal

In [184]:
#取有100data的att #同時解決同p不同q
filter = nogeo_df48.a.value_counts()
filter_index=filter.index[filter>=100]
nogeo_df48_s = nogeo_df48[nogeo_df48['a'].isin(filter_index)]

In [148]:
nogeo_df48_s.a.value_counts()

P2046_Q25343       24039
P1082_Q199         22299
P2044_Q11573       16734
P569               16305
P571               12294
                   ...  
P2196_Q199           128
P1359_Q199           125
P2547_Q11573         125
P2262_Q11573         113
P2997_Q24564698      101
Name: a, Length: 94, dtype: int64

In [185]:
#處理a，去單位
nogeo_df48_s[["pre_a",'Q']] = nogeo_df48_s["a"].str.split('_', 1,expand=True)
nogeo_df48_s.loc[:,"new_a"] = np.where((nogeo_df48_s["a"].str.contains("Q")), nogeo_df48_s["pre_a"], nogeo_df48_s["a"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [187]:
nogeo_df48_s = nogeo_df48_s.loc[:,['e','new_a','v']]
nogeo_df48_s.columns=['e','a','v']


In [189]:
#處理重複值 drop if [e,a] is the same
nogeo_df48_s = nogeo_df48_s.drop_duplicates(subset=['e','a'],keep='last')

In [None]:
#如果要取平均值:處理重複資料
# df48_s = df48_s.groupby(['e', 'a'], as_index=False).mean()

In [190]:
filter = nogeo_df48_s.a.value_counts()
filter_index=filter.index[filter>=100]
nogeo_df48_s = nogeo_df48_s[nogeo_df48_s['a'].isin(filter_index)]

In [191]:
# check the number of data
print(f'triple number of df48: {len(nogeo_df48_s)}, num of df48_entity: {len(nogeo_df48_s["e"].unique())}, num of df48_attribute: {len(nogeo_df48_s["a"].unique())}')

triple number of df48: 181930, num of df48_entity: 47753, num of df48_attribute: 89


In [192]:
nogeo_df48_s

Unnamed: 0,e,a,v
0,Q1000056,P1082,11032.0
1,Q1000056,P2044,472.0
2,Q1000056,P2046,45630000.0
3,Q1000138,P1082,1375.0
4,Q1000138,P2044,1.0
...,...,...,...
206335,Q99937,P571,1798.0
206336,Q999608,P2031,1989.0
206337,Q999608,P2032,2009.0
206338,Q999608,P569,1971.0


In [30]:
#處理0值
nogeo_df48_s[nogeo_df48_s['v']==0.0]

Unnamed: 0,e,a,v,xml,new_v,pre_a,Q
1551,Q1019532,P2927,0.0,^<http://www.w3.org/2001/XMLSchema#decimal>,0,P2927,Q199
1768,Q1024085,P2927,0.0,^<http://www.w3.org/2001/XMLSchema#decimal>,0,P2927,Q199
2007,Q10268484,P6509,0.0,^<http://www.w3.org/2001/XMLSchema#decimal>,+0,P6509,Q199
2052,Q1027483,P6509,0.0,^<http://www.w3.org/2001/XMLSchema#decimal>,+0,P6509,Q199
2178,Q1028196,P2927,0.0,^<http://www.w3.org/2001/XMLSchema#decimal>,0,P2927,Q199
...,...,...,...,...,...,...,...
148380,Q9954,P2044,0.0,^<http://www.w3.org/2001/XMLSchema#decimal>,+0,P2044,Q11573
148633,Q999608,P6509,0.0,^<http://www.w3.org/2001/XMLSchema#decimal>,+0,P6509,Q199
170447,Q302,P569,-0.0,^<http://www.w3.org/2001/XMLSchema#dateTime>,-000,P569,
177290,Q40662,P569,-0.0,^<http://www.w3.org/2001/XMLSchema#dateTime>,-000,P569,


## zero value

In [193]:
# preparing idmap for checking real name
entity_idmap = {}
with open('Entities/entity_labels_en.txt',encoding="utf-8") as f:
    for line in f:
        k, v = line.strip().split('\t')
        entity_idmap[k.strip()] = v.strip()
attri_idmap={}
with open('Attributes/attribute_labels_en.txt',encoding="utf-8") as f:
    for line in f:
        k, v = line.strip().split('\t')
        attri_idmap[k.strip()] = v.strip()
attri_idmap['P625_Latitude']='coordinate location(latitude)'
attri_idmap['P625_Longtiude']='coordinate location(logtitude)'
attri_idmap['P1332_Longtiude']='coordinates of northernmost point'
attri_idmap['P1333_Longtiude']='coordinates of southernmost point'
attri_idmap['P1334_Longtiude']='coordinates of easternmost point'
attri_idmap['P1335_Longtiude']='coordinates of westernmost point'
nogeo_df48_s.loc[:,'name_e']=nogeo_df48_s['e'].map(entity_idmap)
nogeo_df48_s.loc[:,'name_a']=nogeo_df48_s['a'].map(attri_idmap)

In [31]:
df_zero = nogeo_df48_s[nogeo_df48_s['v']==0.0]
df_zero[['name_e','name_a','v']]

Unnamed: 0,name_e,name_a,v
1551,Markleeville,water as percent of area,0.0
1768,El Sobrante,water as percent of area,0.0
2007,Douglas dos Santos,total goals in career,0.0
2052,Luiz Diallisson de Souza Alves,total goals in career,0.0
2178,Cambria,water as percent of area,0.0
...,...,...,...
148380,Purmerend,elevation above sea level,0.0
148633,Gabriel Caballero,total goals in career,0.0
170447,Jesus,date of birth,-0.0
177290,John the Baptist,date of birth,-0.0


In [None]:
list(df_zero.name_a.unique()),list(df_zero.a.unique())

In [32]:
df = [nogeo_df48_s[nogeo_df48_s['a']==el].describe() for el in list(df_zero.a.unique())]

In [34]:
# for i in range(len(df)):
#     print(list(df_zero.name_a.unique())[i],df[i],list(df_zero.a.unique())[i])

In [194]:
# 刪掉特定列，有11個不合理的0值變數
nogeo_df48_s = nogeo_df48_s.drop(nogeo_df48_s[(nogeo_df48_s['a']=='P6509') & (nogeo_df48_s['v']==0.0)].index)#'total goals in career'
nogeo_df48_s = nogeo_df48_s.drop(nogeo_df48_s[(nogeo_df48_s['a']=='P6544') & (nogeo_df48_s['v']==0.0)].index)#total points in career 
nogeo_df48_s = nogeo_df48_s.drop(nogeo_df48_s[(nogeo_df48_s['a']=='P6543') & (nogeo_df48_s['v']==0.0)].index)#total shots in career
nogeo_df48_s = nogeo_df48_s.drop(nogeo_df48_s[(nogeo_df48_s['a']=='P6546') & (nogeo_df48_s['v']==0.0)].index)#penalty minutes in career
nogeo_df48_s = nogeo_df48_s.drop(nogeo_df48_s[(nogeo_df48_s['a']=='P6545') & (nogeo_df48_s['v']==0.0)].index)#total assists in career 
nogeo_df48_s = nogeo_df48_s.drop(nogeo_df48_s[(nogeo_df48_s['a']=='P1100') & (nogeo_df48_s['v']==0.0)].index)#attendance 
nogeo_df48_s = nogeo_df48_s.drop(nogeo_df48_s[(nogeo_df48_s['a']=='P1279') & (nogeo_df48_s['v']==0.0)].index)##inflation rate
nogeo_df48_s = nogeo_df48_s.drop(nogeo_df48_s[(nogeo_df48_s['a']=='P2121') & (nogeo_df48_s['v']==0.0)].index)#prize money
nogeo_df48_s = nogeo_df48_s.drop(nogeo_df48_s[(nogeo_df48_s['a']=='P3872') & (nogeo_df48_s['v']==0.0)].index)#惠顧人數patronage

In [196]:
nogeo_df48_s.reset_index(inplace=True)


In [198]:
nogeo_df48_s= nogeo_df48_s.drop('index',axis=1)
nogeo_df48_s.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180015 entries, 0 to 180014
Data columns (total 5 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   e       180015 non-null  object 
 1   a       180015 non-null  object 
 2   v       180015 non-null  float32
 3   name_e  180015 non-null  object 
 4   name_a  180015 non-null  object 
dtypes: float32(1), object(4)
memory usage: 6.2+ MB


In [None]:
# 縮小domain

In [144]:
filter = nogeo_df48_s.a.value_counts()
filter_index=filter.index[filter>=100]
nogeo_df48_s = nogeo_df48_s[nogeo_df48_s['a'].isin(filter_index)]

In [199]:
print(f'triple number of df48: {len(nogeo_df48_s)}, num of df48_entity: {len(nogeo_df48_s["e"].unique())}, num of df48_attribute: {len(nogeo_df48_s["a"].unique())}')

triple number of df48: 180015, num of df48_entity: 47753, num of df48_attribute: 89


In [200]:
ent_typemap = {}
with open('Entities/entity_types.txt',encoding="utf-8") as f:
    for line in f:
        k, v = line.strip().split('\t')
        ent_typemap[k.strip()] = v.strip()

In [201]:
nogeo_df48_s['ent_type'] = nogeo_df48_s['e'].map(ent_typemap)

In [202]:
nogeo_df48_s = nogeo_df48_s.loc[:,["e","a","v","name_e","name_a","ent_type"]]

In [203]:
print(f'triple number of df48: {len(nogeo_df48_s)}, num of df48_entity: {len(nogeo_df48_s["e"].unique())}, num of df48_attribute: {len(nogeo_df48_s["a"].unique())}')

triple number of df48: 180015, num of df48_entity: 47753, num of df48_attribute: 89


In [244]:
nogeo_df48_s

Unnamed: 0,e,a,v,name_e,name_a,ent_type
0,Q1000056,P1082,11032.0,Sušice,population,Q7841907
1,Q1000056,P2044,472.0,Sušice,elevation above sea level,Q7841907
2,Q1000056,P2046,45630000.0,Sušice,area,Q7841907
3,Q1000138,P1082,1375.0,Cantenac,population,Q484170
4,Q1000138,P2044,1.0,Cantenac,elevation above sea level,Q484170
...,...,...,...,...,...,...
180010,Q99937,P571,1798.0,Cesenatico,inception,Q747074
180011,Q999608,P2031,1989.0,Gabriel Caballero,work period (start),Q5
180012,Q999608,P2032,2009.0,Gabriel Caballero,work period (end),Q5
180013,Q999608,P569,1971.0,Gabriel Caballero,date of birth,Q5


# type specific

In [255]:
row_idx = []
def gather_data(name_att,df):
    df = df.reset_index(drop=True)
    a_type = list(df[df['name_a'].str.contains(name_att)].ent_type.unique())
    for type in a_type:
        list_type=list(df[df['ent_type']==type].index)
        row_idx.extend(list_type)
    return row_idx

In [258]:
idx= gather_data("population",nogeo_df48_s)
pop_related_att = nogeo_df48_s.iloc[idx,:]
pop_related_att.reset_index(inplace=True)
pop_related_att=pop_related_att.drop('index',axis=1)

In [259]:
pop_related_att.to_csv('files_needed/pop_related_att',index=False)

In [263]:
nogeo_df48_s[nogeo_df48_s.name_a.str.contains("work")]

Unnamed: 0,e,a,v,name_e,name_a,ent_type,std_v
125362,Q1001214,P2031,1928.0,Buddy Ebsen,work period (start),Q5,-0.030996
125377,Q100440,P2031,1950.0,Larry Hagman,work period (start),Q5,-0.030996
125378,Q100440,P2032,2012.0,Larry Hagman,work period (end),Q5,-0.030994
125411,Q10118,P2031,2003.0,Victoria Azarenka,work period (start),Q5,-0.030994
125415,Q10120,P2031,2008.0,Laura Robson,work period (start),Q5,-0.030994
...,...,...,...,...,...,...,...
179970,Q995314,P2031,1993.0,Bryon Russell,work period (start),Q5,-0.030994
179971,Q995314,P2032,2009.0,Bryon Russell,work period (end),Q5,-0.030994
179977,Q9960,P2031,1937.0,Ronald Reagan,work period (start),Q5,-0.030996
180011,Q999608,P2031,1989.0,Gabriel Caballero,work period (start),Q5,-0.030994


In [239]:
idx2= gather_data("date of birth",nogeo_df48_s)
people_related_att = nogeo_df48_s.iloc[idx2,:]


In [40]:
var_name = ["population","GDP (PPP)","PPP GDP per capita",
      "date of birth","date of death", "life expectancy",
      "total revenue","net profit",
      "area",
      "retirement age","age of majority","work period (start)","work period (end)"]

In [242]:
people_related_att.name_a.value_counts()

population                           21446
area                                 21367
date of birth                        15932
elevation above sea level            15611
mass                                  8618
                                     ...  
publication date                         1
daily patronage                          1
number of points/goals/set scored        1
number of platform tracks                1
discharge                                1
Name: name_a, Length: 72, dtype: int64

In [45]:
idx_all = []
for var in var_name:
    idx_all = gather_data(var,nogeo_df48_s)


  after removing the cwd from sys.path.


In [47]:
idx_all

[0,
 1,
 2,
 682,
 683,
 684,
 1014,
 1015,
 1016,
 1405,
 1406,
 1407,
 26848,
 26849,
 26850,
 26851,
 75700,
 75701,
 75702,
 77861,
 77862,
 77863,
 79239,
 79240,
 79241,
 88361,
 88362,
 88363,
 96689,
 96690,
 96691,
 96692,
 97389,
 97390,
 97391,
 99044,
 99045,
 99046,
 99753,
 99754,
 99755,
 100804,
 100805,
 100806,
 110112,
 110113,
 110114,
 114187,
 114188,
 114189,
 117175,
 117176,
 117177,
 120131,
 120132,
 120133,
 122913,
 122914,
 122915,
 124100,
 124101,
 124102,
 124812,
 124813,
 124814,
 124815,
 124816,
 124817,
 124818,
 124819,
 124820,
 124824,
 124825,
 124826,
 124827,
 124910,
 124911,
 124912,
 125640,
 125684,
 125685,
 125723,
 125764,
 137021,
 137022,
 159304,
 159987,
 160263,
 160264,
 163919,
 163920,
 167459,
 167802,
 168507,
 168794,
 169208,
 173959,
 175432,
 176723,
 176724,
 178048,
 179122,
 179782,
 180093,
 180094,
 180095,
 180097,
 180127,
 3,
 4,
 5,
 279,
 280,
 281,
 282,
 283,
 284,
 285,
 286,
 287,
 288,
 289,
 290,
 352,
 35

In [243]:
nogeo_df48_s.a.value_counts()

P2046    21751
P1082    21446
P2044    16418
P569     15932
P571     12178
         ...  
P2855      123
P1198      122
P1359      115
P8477      110
P8476      110
Name: a, Length: 89, dtype: int64

In [None]:
list(set(idx_all))

In [48]:
var_extracted = nogeo_df48_s.iloc[list(set(idx_all)),:]

In [52]:
var_extracted = var_extracted.reset_index(drop=True)

In [None]:
nogeo_df48_s.to_csv('/projekte/tcl/tclext/kgc/LiterallyWikidata/files_needed/nogeo_df48',index=False)
#var_extracted.to_csv('/projekte/tcl/tclext/kgc/LiterallyWikidata/files_needed/nogeo_df48_ver02',index=False)
.to_csv('/projekte/tcl/tclext/kgc/LiterallyWikidata/files_needed/nogeo_df48_ver02',index=False)

In [53]:
var_extracted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153093 entries, 0 to 153092
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   e         153093 non-null  object 
 1   a         153093 non-null  object 
 2   v         153093 non-null  float32
 3   name_e    153093 non-null  object 
 4   name_a    153093 non-null  object 
 5   ent_type  153093 non-null  object 
dtypes: float32(1), object(5)
memory usage: 6.4+ MB


## Rescale decimal values

In [245]:
#標準常態係數
from sklearn.preprocessing import StandardScaler
def attvalue2scale(df):
    scaler = StandardScaler()
    alist=list(df['a'].unique())
    for a_name in alist:
        attdf = df[df["a"]==a_name]
        scaler.fit(attdf.v.to_numpy().reshape(-1,1))
        result = scaler.transform(df.v.to_numpy().reshape(-1,1))
        return result


In [246]:
x = attvalue2scale(nogeo_df48_s)

In [247]:
x

array([[-0.03070842],
       [-0.03104233],
       [ 1.411756  ],
       ...,
       [-0.03099373],
       [-0.03099493],
       [-0.03099876]], dtype=float32)

In [249]:
nogeo_df48_s['std_v']=x

In [270]:
nogeo_df48_s.name_a.value_counts()[:10]

area                                     21751
population                               21446
elevation above sea level                16418
date of birth                            15932
inception                                12178
mass                                      8909
height                                    7784
work period (start)                       7049
number of matches played/races/starts     5138
date of death                             5049
Name: name_a, dtype: int64

In [252]:
nogeo_df48_s[nogeo_df48_s.std_v==6.287763e+22]

Unnamed: 0,e,a,v,name_e,name_a,ent_type,std_v
79571,Q525,P2067,1.98855e+30,Sun,mass,Q5864,6.287763e+22


In [120]:
maxoutlier= var_extracted[var_extracted['a']==var_extracted['a'][2]].v > var_extracted[var_extracted['a']==var_extracted['a'][2]].v.mean() + 2 * var_extracted[var_extracted['a']==var_extracted['a'][2]].v.std()
var_extracted.iloc[maxoutlier.index,:]

Unnamed: 0,e,a,v,name_e,name_a,ent_type,std_v
2,Q1000056,P2046,45630000.0,Sušice,area,Q7841907,1.411756
5,Q1000138,P2046,14260000.0,Cantenac,area,Q484170,0.419842
8,Q100013,P2046,21270000.0,Brembilla,area,Q1134686,0.641497
11,Q100015,P2046,12110000.0,Brignano Gera d'Adda,area,Q747074,0.351859
14,Q100016,P2046,8140000.0,Brumano,area,Q747074,0.226328
...,...,...,...,...,...,...,...
518,Q100775,P2046,57360000.0,Calestano,area,Q747074,1.782657
521,Q100779,P2046,58830000.0,Collecchio,area,Q747074,1.829138
524,Q100782,P2046,48410000.0,Colorno,area,Q747074,1.499659
527,Q100785,P2046,37530000.0,Compiano,area,Q747074,1.155635


In [None]:
#minmax method
def attvalue2scale(df):
  alist=list(df['a'].unique())
  for a_name in alist:
    attdf = df[df["a"]==a_name]
    for i in range(len(attdf)):
      result = (df.v - attdf.v.min())/ (attdf.v.max()- attdf.v.min())
      return result


In [254]:
len(nogeo_df48_s.a.unique())

89

## Save the cleaning data 存檔

In [126]:
nogeo_df48_s.to_csv('/projekte/tcl/tclext/kgc/LiterallyWikidata/files_needed/nogeo_df48_a89',index=False)


In [None]:
#outlier not finish

## Split into train, valid and test sets


In [None]:
from sklearn.model_selection import train_test_split
train_attri_data, valid_attri_data = train_test_split(df48_s, test_size=0.2,stratify=df48_s['a'])
#train_attri_data, valid_attri_data = train_test_split(df, test_size=0.2)
valid_attri_data, test_attri_data = train_test_split(valid_attri_data, test_size=0.5,stratify=valid_attri_data['a'])
#valid_attri_data, test_attri_data = train_test_split(valid_attri_data, test_size=0.5)

In [None]:
print(f'training triples of df48: {len(train_attri_data)}, valid triples of df48: {len(valid_attri_data)}, test triples of df48: {len(test_attri_data)}')

In [None]:
len(df48_s)

In [None]:
valid_attri_data

In [None]:
test_attri_data.a.value_counts()

In [None]:
train_attri_data.a.value_counts()

In [None]:
train_attri_data.to_csv(f'train_attri_data.csv',index=False)
valid_attri_data.to_csv(f'valid_attri_data.csv',index=False)
test_attri_data.to_csv(f'test_attri_data.csv',index=False)

In [None]:
df_train_att= pd.read_csv(f'train_attri_data.csv')

In [None]:
df_train_att['e'].unique()

In [None]:
df_train_rel= pd.read_csv(f'train.txt',sep='\t',names=['s','p','o'])
df_test_rel= pd.read_csv(f'test.txt',sep='\t',names=['s','p','o'])
df_valid_rel= pd.read_csv(f'valid.txt',sep='\t',names=['s','p','o'])

In [None]:
df_rel_all = pd.concat([df_train_rel,df_valid_rel,df_test_rel],axis=0)

In [None]:
#e_train_list = list(df_train_rel.s.unique())
e_valid_list = list(df_valid_rel.s.unique())
e_test_list = list(df_test_rel.s.unique())

In [None]:
valid_attri_data[valid_attri_data.e.isin(e_valid_list)==True]

In [None]:
## all data index {ent:idx,rel:idx,att:idx}
entities = pd.read_csv('', sep='\t', names=['label', 'name'])
relations = pd.read_csv( 'Relations/relation_labels_en.txt', sep='\t', names=['label', 'name'])
attributes = attri_data.a.value_counts().index
dict_ent_2_idx = dict(zip(entities['label'], np.arange(0, len(entities), 1)))
dict_rel_2_idx = dict(zip(relations['label'], np.arange(0, len(relations), 1)))
dict_att_2_idx = dict(zip(attributes, np.arange(0, len(attributes), 1)))

## Dict contains all Graph objects
dict_all_2_idx = {}
dict_all_2_idx.update(self.dict_ent_2_idx)
dict_all_2_idx.update(self.dict_rel_2_idx)
dict_all_2_idx.update(self.dict_att_2_idx)



In [None]:
dict_e2rv = dict()
for el in attri_data.values:
    #r = self.dict_att_2_idx[el[1]]
    attri = dict_att_2_idx[el[1]]
    v = round(el[2],5)
    e = dict_ent_2_idx[el[0]]
    if e in dict_e2rv:
        l = dict_e2rv[e]
        l.append([attri,v])
        dict_e2rv[e] = l
    else:
        dict_e2rv[e] = [[attri,v]]

In [3]:
df = pd.read_csv('files_needed/nogeo_df48_var')

In [6]:
filter
df.name_a.value_counts()

area                                 21751
population                           21446
elevation above sea level            16085
date of birth                        15932
mass                                  8622
                                     ...  
apoapsis                                 1
orbital period                           1
orbital inclination                      1
number of points/goals/set scored        1
publication date                         1
Name: name_a, Length: 81, dtype: int64