# Data preprocessing 資料前處理

In [1]:
import pandas as pd
import numpy as np

# From LiterallyWikidata read data and check data

In [97]:
column_name = ["e","a","v"]
df48_ori = pd.read_csv("/projekte/tcl/tclext/kgc_chu/LiterallyWikidata/LitWD48K/numeric_literals.txt",sep="\t",names=column_name)

In [52]:
df48_ori[:5]

Unnamed: 0,e,a,v
0,Q1000056,P1082_Q199,+11032^^<http://www.w3.org/2001/XMLSchema#deci...
1,Q1000056,P2044_Q11573,+472^^<http://www.w3.org/2001/XMLSchema#decimal>
2,Q1000056,P2046_Q25343,+45630000.00^^<http://www.w3.org/2001/XMLSchem...
3,Q1000138,P1082_Q199,+1375^^<http://www.w3.org/2001/XMLSchema#decimal>
4,Q1000138,P2044_Q11573,+1^^<http://www.w3.org/2001/XMLSchema#decimal>


In [98]:
df48 = df48_ori

In [99]:
#取V值
df48[['v','xml']] = df48['v'].str.split('^', 1, expand=True)

In [5]:
# check the number of data
print(f'triple number of df48: {len(df48)}, num of df48_entity: {len(df48["e"].unique())}, num of df48_attribute: {len(df48["a"].unique())}')

triple number of df48: 324418, num of df48_entity: 47998, num of df48_attribute: 291


In [7]:
df48.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 324418 entries, 0 to 324417
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   e       324418 non-null  object
 1   a       324418 non-null  object
 2   v       324418 non-null  object
dtypes: object(3)
memory usage: 7.4+ MB


# Discovering some issues within data

## 3 value types

In [10]:
#有三種數值型態
df48['xml'].value_counts()

^<http://www.w3.org/2001/XMLSchema#decimal>     148707
^<http://www.w3.org/2001/XMLSchema#double>      118078
^<http://www.w3.org/2001/XMLSchema#dateTime>     57633
Name: xml, dtype: int64

## Time (year-month-date)

In [11]:
# value type: time
df48[df48['xml']=='^<http://www.w3.org/2001/XMLSchema#dateTime>'][:5]

Unnamed: 0,e,a,v,xml
148707,Q1000051,P569,1884-11-05T00:00:00Z,^<http://www.w3.org/2001/XMLSchema#dateTime>
148708,Q1000051,P570,1962-12-01T00:00:00Z,^<http://www.w3.org/2001/XMLSchema#dateTime>
148709,Q1000056,P1249,1233-01-01T00:00:00Z,^<http://www.w3.org/2001/XMLSchema#dateTime>
148710,Q1000138,P576,2016-12-31T00:00:00Z,^<http://www.w3.org/2001/XMLSchema#dateTime>
148711,Q1000217,P571,1893-01-01T00:00:00Z,^<http://www.w3.org/2001/XMLSchema#dateTime>


## Double: coordinate location

In [12]:
# value type: double (geographic)
# coordinate location in original data is mixed. P625，P1335...(coordinate location)有經緯度
df48[df48["a"].str.contains("P1335")][:5]

Unnamed: 0,e,a,v,xml
206346,Q100015,P1335_Latitude,45.5439988,^<http://www.w3.org/2001/XMLSchema#double>
206351,Q100016,P1335_Latitude,45.8495584,^<http://www.w3.org/2001/XMLSchema#double>
206356,Q100018,P1335_Latitude,45.6671782,^<http://www.w3.org/2001/XMLSchema#double>
206362,Q100036,P1335_Latitude,44.7203568,^<http://www.w3.org/2001/XMLSchema#double>
206367,Q100038,P1335_Latitude,44.7499868,^<http://www.w3.org/2001/XMLSchema#double>


## Decimal

In [None]:
# value type: decimal
# same entity has multiple values 可能同entity在P1087有多個值
df48[df48["a"].str.contains("P1087_Q199")][:5]

In [None]:
# P2299 has 2 units 有兩種QXXX
df48[df48["a"].str.contains("P2299")].a.unique()

In [None]:
# some are meanful zero value but some are not
df_vzero=df48[df48["v"]=='0'].sort_values(by='a')

In [None]:
df_vzero[:5]

In [None]:
print(f"there are total {len(df_vzero)} data which has 0.0 in the value column")
print(f"there are total {len(df_vzero['a'].unique())} data which has 0.0 in the value column")

In [None]:
df48[df48["a"].str.contains("P1082")].v

# Dealing with issues 處理以上問題: 
DataTime dtype: keep year data 時間單位 西元年 \\
Remove dulplicated data (same e,a but diff v)
Turn into value dtype to float 
所有變數的值成為可計算的值(有意義可讀取的值) \\
Rescale the values within the same attributes 同attribute value rescale \\
Drop out unreasonal 0 value 去0 \\
Drop out the unit (QXXX)去單位 \\
Keep Attribyes at least number of 15 data 每個屬性最少有15筆資料 \\


## value type: time

In [100]:
# 如果"v"是時間，取前四位數年代部分，np.where(condition, x,y)在condition下，把x取代y
df48["new_v"] = df48["v"].map(lambda x: x[:4])
df48.loc[:,"v"] = np.where((df48["xml"].str.contains("dateTime")), df48["new_v"], df48["v"])

In [101]:
df48["v"]=df48["v"].astype('float32')

## value type: double

In [None]:
# remove double data
#neogeo = df48[~df48['xml'].str.contains('double')]

## value type: decimal

In [102]:
#取有150data的att #同時解決同p不同q
filter = df48.a.value_counts()
filter_index=filter.index[filter>=150]
df48 = df48[df48['a'].isin(filter_index)]

In [58]:
df48.a.value_counts()

P625_Longtiude    24952
P625_Latitude     24952
P2046_Q25343      24039
P1082_Q199        22299
P2044_Q11573      16734
                  ...  
P2219_Q199          164
P2131_Q4917         163
P4010_Q550207       156
P2299_Q550207       155
P1279_Q199          152
Name: a, Length: 93, dtype: int64

In [103]:
#處理a，去單位
df48[["pre_a",'Q']] = df48["a"].str.split('_', 1,expand=True)
df48.loc[:,"new_a"] = np.where((df48["a"].str.contains("Q")), df48["pre_a"], df48["a"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [104]:
df48 = df48.loc[:,['e','new_a','v']]
df48.columns=['e','a','v']


In [105]:
#處理重複值 drop if [e,a] is the same
df48 = df48.drop_duplicates(subset=['e','a'],keep='last')

In [None]:
#如果要取平均值:處理重複資料
# df48 = df48.groupby(['e', 'a'], as_index=False).mean()

In [106]:
df48.a.value_counts()

P625_Longtiude    24952
P625_Latitude     24952
P2046             21751
P1082             21446
P2044             16418
                  ...  
P1087               135
P8477               110
P8476               110
P3864                53
P1603                 8
Name: a, Length: 93, dtype: int64

In [107]:
filter = df48.a.value_counts()
filter_index=filter.index[filter>=150]
df48 = df48[df48['a'].isin(filter_index)]

In [108]:
#處理0值
df48[df48['v']==0.0]

Unnamed: 0,e,a,v
1551,Q1019532,P2927,0.0
1768,Q1024085,P2927,0.0
2007,Q10268484,P6509,0.0
2052,Q1027483,P6509,0.0
2178,Q1028196,P2927,0.0
...,...,...,...
298291,Q47545,P625_Longtiude,0.0
299412,Q483102,P625_Longtiude,0.0
305557,Q51,P1333_Longtiude,0.0
305558,Q51,P625_Longtiude,0.0


## zero value

In [109]:
# preparing idmap for checking real name
entity_idmap = {}
with open('Entities/entity_labels_en.txt',encoding="utf-8") as f:
    for line in f:
        k, v = line.strip().split('\t')
        entity_idmap[k.strip()] = v.strip()
attri_idmap={}
with open('Attributes/attribute_labels_en.txt',encoding="utf-8") as f:
    for line in f:
        k, v = line.strip().split('\t')
        attri_idmap[k.strip()] = v.strip()
attri_idmap['P625_Latitude']='coordinate location(latitude)'
attri_idmap['P625_Longtiude']='coordinate location(logtitude)'
attri_idmap['P1332_Longtiude']='coordinates of northernmost point'
attri_idmap['P1333_Longtiude']='coordinates of southernmost point'
attri_idmap['P1334_Longtiude']='coordinates of easternmost point'
attri_idmap['P1335_Longtiude']='coordinates of westernmost point'
b
df48.loc[:,'name_e']=df48['e'].map(entity_idmap)
df48.loc[:,'name_a']=df48['a'].map(attri_idmap)

In [110]:
# 刪掉特定列，有11個不合理的0值變數
df48 = df48.drop(df48[(df48['a']=='P6509') & (df48['v']==0.0)].index)#'total goals in career'
df48 = df48.drop(df48[(df48['a']=='P6544') & (df48['v']==0.0)].index)#total points in career 
df48 = df48.drop(df48[(df48['a']=='P6543') & (df48['v']==0.0)].index)#total shots in career
df48 = df48.drop(df48[(df48['a']=='P6546') & (df48['v']==0.0)].index)#penalty minutes in career
df48 = df48.drop(df48[(df48['a']=='P6545') & (df48['v']==0.0)].index)#total assists in career 
df48 = df48.drop(df48[(df48['a']=='P1100') & (df48['v']==0.0)].index)#attendance 
df48 = df48.drop(df48[(df48['a']=='P1279') & (df48['v']==0.0)].index)##inflation rate
df48 = df48.drop(df48[(df48['a']=='P2121') & (df48['v']==0.0)].index)#prize money
df48 = df48.drop(df48[(df48['a']=='P3872') & (df48['v']==0.0)].index)#惠顧人數patronage

In [111]:
df48.reset_index(inplace=True)


In [112]:
df48= df48.drop('index',axis=1)
df48.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 296457 entries, 0 to 296456
Data columns (total 5 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   e       296457 non-null  object 
 1   a       296457 non-null  object 
 2   v       296457 non-null  float32
 3   name_e  296457 non-null  object 
 4   name_a  262436 non-null  object 
dtypes: float32(1), object(4)
memory usage: 10.2+ MB


In [None]:
# 縮小domain

In [113]:
filter = df48.a.value_counts()
filter_index=filter.index[filter>=150]
df48 = df48[df48['a'].isin(filter_index)]

In [114]:
print(f'triple number of df48: {len(df48)}, num of df48_entity: {len(df48["e"].unique())}, num of df48_attribute: {len(df48["a"].unique())}')

triple number of df48: 296308, num of df48_entity: 47939, num of df48_attribute: 86


In [115]:
ent_typemap = {}
with open('Entities/entity_types.txt',encoding="utf-8") as f:
    for line in f:
        k, v = line.strip().split('\t')
        ent_typemap[k.strip()] = v.strip()

In [116]:
df48['ent_type'] = df48['e'].map(ent_typemap)

In [117]:
df48 = df48.loc[:,["e","a","v","name_e","name_a","ent_type"]]

In [146]:
print(f'triple number of df48: {len(df48)}, num of df48_entity: {len(df48["e"].unique())}, num of df48_attribute: {len(df48["a"].unique())}')

triple number of df48: 296308, num of df48_entity: 47939, num of df48_attribute: 86


In [119]:
df48.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 296308 entries, 0 to 296456
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   e         296308 non-null  object 
 1   a         296308 non-null  object 
 2   v         296308 non-null  float32
 3   name_e    296308 non-null  object 
 4   name_a    262287 non-null  object 
 5   ent_type  296308 non-null  object 
dtypes: float32(1), object(5)
memory usage: 14.7+ MB


In [120]:
df48['name_a']=df48['name_a'].astype('string')

In [78]:
df48_p=df48

In [131]:
row_idx = []
def gather_data(name_att,df):
    df = df.reset_index(drop=True)
    ents = list(df[df['name_a'].str.contains(name_att,na=False)].e.unique())
    for ent in ents:
        list_ent=list(df[df['e']==ent].index)
        row_idx.extend(list_ent)
    return row_idx

In [45]:
ent_idx=gather_data('population',df48_p)

In [132]:
ent_idx2=gather_data('GDP',df48_p)

In [133]:
df48_p.iloc[ent_idx2,:]

Unnamed: 0,e,a,v,name_e,name_a,ent_type
147,Q1000,P1081,7.020000e-01,Gabon,Human Development Index,Q3624078
148,Q1000,P1082,2.025137e+06,Gabon,population,Q3624078
150,Q1000,P2046,2.676670e+08,Gabon,area,Q3624078
151,Q1000,P2131,1.462288e+10,Gabon,nominal GDP,Q3624078
152,Q1000,P2132,7.413000e+03,Gabon,nominal GDP per capita,Q3624078
...,...,...,...,...,...,...
237095,Q986,P1335_Latitude,1.511000e+01,Eritrea,,Q7270
237096,Q986,P625_Latitude,1.548333e+01,Eritrea,coordinate location(latitude),Q7270
296067,Q986,P1332_Longtiude,3.857000e+01,Eritrea,coordinates of northernmost point,Q7270
296068,Q986,P1335_Longtiude,3.665750e+01,Eritrea,coordinates of westernmost point,Q7270


In [134]:
gdp_related_att = df48.iloc[list(set(ent_idx2)),:]

In [140]:
gdp_related_att=gdp_related_att.loc[:,['e','a','v']]

In [141]:
gdp_related_att.to_csv('files_needed/gdp_related.txt',index=False)

# type specific

In [None]:
row_idx = []
def gather_data(name_att,df):
    df = df.reset_index(drop=True)
    a_type = list(df[df['name_a'].str.contains(name_att)].ent_type.unique())
    for type in a_type:
        list_type=list(df[df['ent_type']==type].index)
        row_idx.extend(list_type)
    return row_idx

In [None]:
idx= gather_data("population",df48)
pop_related_att = df48.iloc[idx,:]
pop_related_att.reset_index(inplace=True)
pop_related_att=pop_related_att.drop('index',axis=1)

In [None]:
pop_related_att.to_csv('files_needed/pop_related_att',index=False)

In [None]:
df48[df48.name_a.str.contains("work")]

In [None]:
idx2= gather_data("date of birth",df48)
people_related_att = df48.iloc[idx2,:]


In [None]:
var_name = ["population","GDP (PPP)","PPP GDP per capita",
      "date of birth","date of death", "life expectancy",
      "total revenue","net profit",
      "area",
      "retirement age","age of majority","work period (start)","work period (end)"]

In [None]:
people_related_att.name_a.value_counts()

In [17]:
pop_related_att = pd.read_csv('files_needed/pop_related_att')

In [137]:
pop_related_att

Unnamed: 0,e,a,v,name_e,name_a,ent_type,std_v
0,Q1000056,P1082,11032.00,Sušice,population,Q7841907,-0.030708
1,Q1000056,P2044,472.00,Sušice,elevation above sea level,Q7841907,-0.031042
2,Q1000056,P2046,45630000.00,Sušice,area,Q7841907,1.411756
3,Q1010144,P1082,5419.00,Hluboká nad Vltavou,population,Q7841907,-0.030886
4,Q1010144,P2044,394.00,Hluboká nad Vltavou,elevation above sea level,Q7841907,-0.031045
...,...,...,...,...,...,...,...
149557,Q48438,P569,280.00,Saint George,date of birth,Q21070568,-0.031048
149558,Q48438,P570,303.00,Saint George,date of death,Q21070568,-0.031048
149559,Q217533,P2048,1.85,Joker,height,Q6498903,-0.031057
149560,Q217533,P2067,75.00,Joker,mass,Q6498903,-0.031055


In [None]:
idx_all = []
for var in var_name:
    idx_all = gather_data(var,df48)


In [None]:
idx_all

In [None]:
df48.a.value_counts()

In [None]:
list(set(idx_all))

In [None]:
var_extracted = df48.iloc[list(set(idx_all)),:]

In [None]:
var_extracted = var_extracted.reset_index(drop=True)

In [None]:
var_extracted.info()

## Rescale decimal values

In [142]:
#標準常態係數
from sklearn.preprocessing import StandardScaler
def attvalue2scale(df):
    scaler = StandardScaler()
    alist=list(df['a'].unique())
    for a_name in alist:
        attdf = df[df["a"]==a_name]
        scaler.fit(attdf.v.to_numpy().reshape(-1,1))
        result = scaler.transform(df.v.to_numpy().reshape(-1,1))
        return result


In [147]:
x = attvalue2scale(df48)

In [None]:
x

In [148]:
df48['std_v']=x

In [149]:
df48.a.value_counts()[:10]

P625_Longtiude     24952
P625_Latitude      24952
P2046              21751
P1082              21446
P2044              16418
P569               15932
P571               12178
P2067               8909
P1332_Longtiude     8537
P1332_Latitude      8537
Name: a, dtype: int64

In [None]:
df48[df48.std_v==6.287763e+22]

In [None]:
maxoutlier= df48[df48['a']=='P2046'].v > df48[df48['a']=='P2046'].v.mean() + 2 * df48[df48['a']=='P2046'].v.std()                          

In [None]:
df48.iloc[maxoutlier.index,:]

In [None]:
df48.loc[125347,'v']=4.239700e+8

In [155]:
#minmax method
def attvalue2maxmin(df):
  alist=list(df['a'].unique())
  for a_name in alist:
    attdf = df[df["a"]==a_name]
    for i in range(len(attdf)):
      result = (df.v - attdf.v.min())/ (attdf.v.max()- attdf.v.min())
      return result


In [156]:
df48.loc[:,'minmax_v']=attvalue2maxmin(df48)

In [159]:
df48=df48.drop('index',axis=1)

In [151]:
df48 = df48.reset_index()
df48.drop('index',axis=1)

Unnamed: 0,e,a,v,name_e,name_a,ent_type,std_v
0,Q1000056,P1082,1.103200e+04,Sušice,population,Q7841907,-0.030708
1,Q1000056,P2044,4.720000e+02,Sušice,elevation above sea level,Q7841907,-0.031042
2,Q1000056,P2046,4.563000e+07,Sušice,area,Q7841907,1.411756
3,Q1000138,P1082,1.375000e+03,Cantenac,population,Q484170,-0.031014
4,Q1000138,P2044,1.000000e+00,Cantenac,elevation above sea level,Q484170,-0.031057
...,...,...,...,...,...,...,...
296303,Q99987,P1333_Longtiude,9.586108e+00,Brembate di Sopra,coordinates of southernmost point,Q747074,-0.031057
296304,Q99987,P1334_Longtiude,9.595788e+00,Brembate di Sopra,coordinates of easternmost point,Q747074,-0.031057
296305,Q99987,P1335_Longtiude,9.564391e+00,Brembate di Sopra,coordinates of westernmost point,Q747074,-0.031057
296306,Q99987,P625_Longtiude,9.580647e+00,Brembate di Sopra,coordinate location(logtitude),Q747074,-0.031057


## Save the cleaning data 存檔

In [160]:
#df48 = df48.loc[:,['e','a','v','name_e','name_a','ent_type','std_v']]
df48.to_csv('files_needed/numeric_literals_ver06',index=False)


In [None]:
#outlier not finish

## Split into train, valid and test sets


In [None]:
from sklearn.model_selection import train_test_split
train_attri_data, valid_attri_data = train_test_split(df48, test_size=0.2,stratify=df48['a'])
#train_attri_data, valid_attri_data = train_test_split(df, test_size=0.2)
valid_attri_data, test_attri_data = train_test_split(valid_attri_data, test_size=0.5,stratify=valid_attri_data['a'])
#valid_attri_data, test_attri_data = train_test_split(valid_attri_data, test_size=0.5)

In [None]:
print(f'training triples of df48: {len(train_attri_data)}, valid triples of df48: {len(valid_attri_data)}, test triples of df48: {len(test_attri_data)}')

In [None]:
len(df48)

In [None]:
valid_attri_data

In [None]:
test_attri_data.a.value_counts()

In [None]:
train_attri_data.a.value_counts()

In [None]:
train_attri_data.to_csv(f'train_attri_data.csv',index=False)
valid_attri_data.to_csv(f'valid_attri_data.csv',index=False)
test_attri_data.to_csv(f'test_attri_data.csv',index=False)

In [None]:
df_train_att= pd.read_csv(f'train_attri_data.csv')

In [None]:
df_train_att['e'].unique()

In [None]:
df_train_rel= pd.read_csv(f'train.txt',sep='\t',names=['s','p','o'])
df_test_rel= pd.read_csv(f'test.txt',sep='\t',names=['s','p','o'])
df_valid_rel= pd.read_csv(f'valid.txt',sep='\t',names=['s','p','o'])

In [None]:
df_rel_all = pd.concat([df_train_rel,df_valid_rel,df_test_rel],axis=0)

In [None]:
#e_train_list = list(df_train_rel.s.unique())
e_valid_list = list(df_valid_rel.s.unique())
e_test_list = list(df_test_rel.s.unique())

In [None]:
valid_attri_data[valid_attri_data.e.isin(e_valid_list)==True]

In [190]:
list(df48['name_a'].value_counts().index)[:30]

['coordinate location(logtitude)',
 'coordinate location(latitude)',
 'area',
 'population',
 'elevation above sea level',
 'date of birth',
 'inception',
 'mass',
 'coordinates of northernmost point',
 'coordinates of easternmost point',
 'coordinates of westernmost point',
 'coordinates of southernmost point',
 'height',
 'work period (start)',
 'number of matches played/races/starts',
 'date of death',
 'work period (end)',
 'total goals in career',
 'water as percent of area',
 'penalty minutes in career',
 'total points in career',
 'career plus-minus rating',
 'total shots in career',
 'total assists in career',
 'duration',
 'start time',
 'date of official opening',
 'end time',
 'publication date',
 'length']

In [170]:

indx3 = gather_data('life expectancy',df48)

In [175]:
var_name = ["population","GDP (PPP)","PPP GDP per capita",
      "date of birth","date of death", "life expectancy",
      "total revenue","net profit",
      "area",
      "retirement age","age of majority","work period (start)","work period (end)"]

Unnamed: 0,e,a,v,name_e,name_a,ent_type,std_v,minmax_v


In [209]:
df48.iloc[123821,2]=423970000

In [223]:
df48[df48.name_a=='mass'].describe()

Unnamed: 0,v,std_v,minmax_v
count,8909.0,8909.0,8909.0
mean,2.2342e+26,7.064505e+18,5.365189e+16
std,inf,6.661651e+20,inf
min,0.02,-0.03105725,4.802783e-12
25%,75.0,-0.03105488,1.801044e-08
50%,83.91459,-0.0310546,2.015118e-08
75%,92.98643,-0.03105431,2.232968e-08
max,1.98855e+30,6.287763e+22,4.775287e+20


In [213]:
#rescale area

df48.loc[:,'std_v']=attvalue2scale(df48)

In [165]:
## all data index {ent:idx,rel:idx,att:idx}
entities = pd.read_csv('Entities/entity_labels_en.txt', sep='\t', names=['label', 'name'])
relations = pd.read_csv( 'Relations/relation_labels_en.txt', sep='\t', names=['label', 'name'])
attributes = pd.read_csv( 'files_needed/attribute.txt', names=['label'])
dict_ent_2_idx = dict(zip(entities['label'], np.arange(0, len(entities), 1)))
dict_rel_2_idx = dict(zip(relations['label'], np.arange(0, len(relations), 1)))
dict_att_2_idx = dict(zip(attributes['label'], np.arange(0, len(relations), 1)))

## Dict contains all Graph objects
dict_all_2_idx = {}
dict_all_2_idx.update(dict_ent_2_idx)
dict_all_2_idx.update(dict_rel_2_idx)
dict_all_2_idx.update(dict_att_2_idx)

dict_all_2_idx

In [161]:
dict_e2rv = dict()
for el in attri_data.values:
    #r = self.dict_att_2_idx[el[1]]
    attri = dict_att_2_idx[el[1]]
    v = round(el[2],5)
    e = dict_ent_2_idx[el[0]]
    if e in dict_e2rv:
        l = dict_e2rv[e]
        l.append([attri,v])
        dict_e2rv[e] = l
    else:
        dict_e2rv[e] = [[attri,v]]

NameError: name 'attri_data' is not defined

In [166]:
# make a dict
dict_a2ev = dict()
for el in train_att.values:
    attri = dict_all_2_idx[el[1]]
    v = el[2]
    e = dict_all_2_idx[el[0]]
    if attri in dict_a2ev:
        l = dict_a2ev[attri]
        l.append([e,v])
        dict_a2ev[attri] = l
    else:
        dict_a2ev[attri] = [[e,v]]

NameError: name 'train_att' is not defined

In [None]:
import pickle
with open('/mount/projekte7/tcl/tclext/kgc/LiterallyWikidata/files_needed/dict_a2ev.pickle', 'wb') as fw:
    pickle.dump(dict_a2ev, fw, protocol=pickle.HIGHEST_PROTOCOL)