In [96]:
import pandas as pd

### Data M1LM

In [97]:
# Data
m1lm_movies = pd.read_csv('../datasets/ml1m/mappings/product_mappings.txt', sep='\t')
m1lm_entities = pd.read_csv('../datasets/ml1m/joint-kg/kg/e_map.dat', sep='\t', header=None, names=['entity_id', 'uri'])
m1lm_relations = pd.read_csv('../datasets/ml1m/joint-kg/kg/r_map.dat', sep='\t', header=None, names=['relation_id', 'uri'])

m1lm_ratings = pd.read_csv('../datasets/ml1m/ratings.dat', sep='::', header=None, names=['user_id', 'item_id', 'rating', 'timestamp'])

m1lm_test = pd.read_csv('../datasets/ml1m/joint-kg/kg/test.dat', sep='\t', header=None, names=['subject', 'object', 'predict'])
m1lm_train = pd.read_csv('../datasets/ml1m/joint-kg/kg/train.dat', sep='\t', header=None, names=['subject', 'object', 'predict'])
m1lm_valid = pd.read_csv('../datasets/ml1m/joint-kg/kg/valid.dat', sep='\t', header=None, names=['subject', 'object', 'predict'])

  return func(*args, **kwargs)


In [119]:
total_triples = m1lm_test.shape[0] + m1lm_train.shape[0] + m1lm_valid.shape[0]

# Values prior preprocessing
print("Number of movies in ML1M: ", m1lm_movies.shape[0])
print("Total triples in ML1M: ", total_triples)
print("Percentaje of triples in test: ", m1lm_test.shape[0] / total_triples * 100)
print("Percentaje of triples in train: ", m1lm_train.shape[0] / total_triples * 100)
print("Percentaje of triples in valid: ", m1lm_valid.shape[0] / total_triples * 100)

Number of movies in ML1M:  3265
Total triples in ML1M:  434189
Percentaje of triples in test:  19.999355119544717
Percentaje of triples in train:  70.00062184901046
Percentaje of triples in valid:  10.000023031444831


In [106]:
# Group m1lm_ratings by item id and count the number of ratings
item_ratings = m1lm_ratings.groupby('item_id').size().reset_index(name='ratings_count')
item_ratings

#Join with m1lm_movies by ml1m_id
m1lm_rating = m1lm_movies.set_index('ml1m_id')
m1lm_rating = m1lm_movies.join(item_ratings, how='left')
m1lm_rating = m1lm_rating[["kg_id", "ratings_count"]]
m1lm_rating = m1lm_rating.set_index('kg_id')

m1lm_rating

Unnamed: 0_level_0,ratings_count
kg_id,Unnamed: 1_level_1
0,2077
1,701
2,478
3,170
4,296
...,...
3260,2
3261,320
3262,12
3263,59


In [111]:
# Movies relations
def get_item_relations(file, relation_name):
    with open(file, "r") as file:
        data = [line.strip().split() for line in file]

    data = [[int(x) for x in row] for row in data]

    df = pd.DataFrame({f"{relation_name}": data})

    return df

belong_to_m_ca = get_item_relations("../datasets/ml1m/relations/belong_to_m_ca.txt", "belong_to_m_ca")
cinematography_m_ci = get_item_relations("../datasets/ml1m/relations/cinematography_m_ci.txt", "cinematography_m_ci")
composed_by_m_c = get_item_relations("../datasets/ml1m/relations/composed_by_m_c.txt", "composed_by_m_c")
directed_by_m_d = get_item_relations("../datasets/ml1m/relations/directed_by_m_d.txt", "directed_by_m_d")
edited_by_m_ed = get_item_relations("../datasets/ml1m/relations/edited_by_m_ed.txt", "edited_by_m_ed")
produced_by_company_m_pc = get_item_relations("../datasets/ml1m/relations/produced_by_company_m_pc.txt", "produced_by_company_m_pc")
produced_by_producer_m_pr = get_item_relations("../datasets/ml1m/relations/produced_by_producer_m_pr.txt", "produced_by_producer_m_pr")
starring_m_a = get_item_relations("../datasets/ml1m/relations/starring_m_a.txt", "starring_m_a")
wrote_by_m_w = get_item_relations("../datasets/ml1m/relations/wrote_by_m_w.txt", "wrote_by_m_w")

m1lm_movies_relations = pd.concat([belong_to_m_ca, cinematography_m_ci, composed_by_m_c, directed_by_m_d, edited_by_m_ed, produced_by_company_m_pc, produced_by_producer_m_pr, starring_m_a, wrote_by_m_w], axis=1)

# Set index as id of m1lm_movies_relations
m1lm_movies_relations = m1lm_movies_relations.set_index(m1lm_movies_relations.index)

# Remove the row 3265
m1lm_movies_relations = m1lm_movies_relations.drop(3265)

# Join with m1lm_rating by kg_id
m1lm_movies_relations = m1lm_movies_relations.join(m1lm_rating, how='left')

m1lm_movies_relations


Unnamed: 0,belong_to_m_ca,cinematography_m_ci,composed_by_m_c,directed_by_m_d,edited_by_m_ed,produced_by_company_m_pc,produced_by_producer_m_pr,starring_m_a,wrote_by_m_w,ratings_count
0,"[544, 585, 607, 160, 361, 566, 135, 293, 147, ...",[195],[],[485],[],"[35, 241]","[169, 334]","[1571, 374, 2419, 2426, 1217, 798]",[124],2077
1,"[439, 462, 147, 89, 30, 332, 31, 299, 609, 191...",[8],[],[337],[44],[148],[101],"[1212, 1131, 105, 1146, 321, 1114]",[600],701
2,"[283, 379, 75, 360, 258, 607, 272, 57, 243, 37...",[],[],[358],[189],[43],"[386, 575]","[840, 804, 330, 1332]",[582],478
3,"[160, 533, 607, 31, 85, 16, 360, 68, 299, 539,...",[28],[],[265],[116],[137],[376],"[762, 2375, 2114, 1953, 1038]","[290, 381]",170
4,"[607, 372, 57, 523, 240, 286, 272, 71, 292, 85...",[96],[],[394],"[174, 53]","[54, 148]",[415],"[76, 1971, 1247, 125, 912, 1978, 1270, 2057, 5...","[535, 439]",296
...,...,...,...,...,...,...,...,...,...,...
3260,"[587, 462, 395, 89, 140, 14, 30, 272, 299, 57,...",[],[],[222],[],[115],[],[448],[250],2
3261,"[89, 16, 57, 73, 395, 299, 462, 243, 360, 286,...",[],[],[],[],[],[],[],[],320
3262,"[605, 525, 412, 395, 599, 291, 89, 334, 261, 2...",[158],[],[22],[154],[254],[],[],[],12
3263,"[283, 30, 147, 286, 372, 573, 188, 57, 243, 15...",[158],[],[85],[188],"[43, 54]","[91, 102]",[1997],[],59


In [115]:
# create total_relations as the sum of the length of belong_to_m_ca, cinematography_m_ci, composed_by_m_c, directed_by_m_d, edited_by_m_ed, produced_by_company_m_pc, produced_by_producer_m_pr, starring_m_a, wrote_by_m_w
total_relations = (
    belong_to_m_ca['belong_to_m_ca'].apply(len) +
    cinematography_m_ci['cinematography_m_ci'].apply(len) +
    composed_by_m_c['composed_by_m_c'].apply(len) +
    directed_by_m_d['directed_by_m_d'].apply(len) +
    edited_by_m_ed['edited_by_m_ed'].apply(len) +
    produced_by_company_m_pc['produced_by_company_m_pc'].apply(len) +
    produced_by_producer_m_pr['produced_by_producer_m_pr'].apply(len) +
    starring_m_a['starring_m_a'].apply(len) +
    wrote_by_m_w['wrote_by_m_w'].apply(len) +
    m1lm_movies_relations['ratings_count']
)

# add total_relations as a new column to m1lm_movies_relations
m1lm_movies_relations['total_relations'] = total_relations

# Return movies with 200 or more relations
len(m1lm_movies_relations[m1lm_movies_relations['total_relations'] >= 200])

1451

In [113]:
m1lm_movies_relations

Unnamed: 0,belong_to_m_ca,cinematography_m_ci,composed_by_m_c,directed_by_m_d,edited_by_m_ed,produced_by_company_m_pc,produced_by_producer_m_pr,starring_m_a,wrote_by_m_w,ratings_count,total_relations
0,"[544, 585, 607, 160, 361, 566, 135, 293, 147, ...",[195],[],[485],[],"[35, 241]","[169, 334]","[1571, 374, 2419, 2426, 1217, 798]",[124],2077,2136.0
1,"[439, 462, 147, 89, 30, 332, 31, 299, 609, 191...",[8],[],[337],[44],[148],[101],"[1212, 1131, 105, 1146, 321, 1114]",[600],701,742.0
2,"[283, 379, 75, 360, 258, 607, 272, 57, 243, 37...",[],[],[358],[189],[43],"[386, 575]","[840, 804, 330, 1332]",[582],478,523.0
3,"[160, 533, 607, 31, 85, 16, 360, 68, 299, 539,...",[28],[],[265],[116],[137],[376],"[762, 2375, 2114, 1953, 1038]","[290, 381]",170,210.0
4,"[607, 372, 57, 523, 240, 286, 272, 71, 292, 85...",[96],[],[394],"[174, 53]","[54, 148]",[415],"[76, 1971, 1247, 125, 912, 1978, 1270, 2057, 5...","[535, 439]",296,353.0
...,...,...,...,...,...,...,...,...,...,...,...
3260,"[587, 462, 395, 89, 140, 14, 30, 272, 299, 57,...",[],[],[222],[],[115],[],[448],[250],2,36.0
3261,"[89, 16, 57, 73, 395, 299, 462, 243, 360, 286,...",[],[],[],[],[],[],[],[],320,342.0
3262,"[605, 525, 412, 395, 599, 291, 89, 334, 261, 2...",[158],[],[22],[154],[254],[],[],[],12,48.0
3263,"[283, 30, 147, 286, 372, 573, 188, 57, 243, 15...",[158],[],[85],[188],"[43, 54]","[91, 102]",[1997],[],59,97.0


In [123]:
# Knowledge Graph m1lm
print("Number of entities in m1lm: ", m1lm_entities.shape[0])
print("Number of relations in m1lm: ", m1lm_movies_relations['total_relations'].sum())
print("Number of triplets in m1lm: ", total_triples)

# Relations include belong_to_m_ca, cinematography_m_ci, etc. Plus ratings
print("Number of Relations Types in m1lm: ", 10)

# Sum all total_relations in m1lm_movies_relations

Number of entities in m1lm:  14708
Number of relations in m1lm:  1027159.0
Number of triplets in m1lm:  434189
Number of Relations Types in m1lm:  20


### Data LASTFM