In [1]:
import pandas as pd
from pathlib import Path
import os
import numpy as np
import re
from collections import Counter
import stanza

# Funzioni

In [2]:
def lexical_analysis(df): 
    nlp = stanza.Pipeline(lang='en', processors='tokenize,sentiment,pos,lemma', use_gpu=True, verbose=False)
    rows_list = []
    for row in df.itertuples():
        doc = nlp(str(row.reviewText))
        for sentence in doc.sentences:
            sentiment = sentno_to_word(int(sentence.sentiment))
            for word in sentence.words:
                dict_ = {}
                dict_.update({'item_id': str(row.itemID)})
                dict_.update({'review_id': str(row.itemID) + str(row.reviewerID)}) 
                dict_.update({'word': str(word.lemma)}) 
                dict_.update({'sentiment': sentiment})
                dict_.update({'upos': str(word.upos)})
                dict_.update({'xpos': str(word.xpos)})
                rows_list.append(dict_)
    sentiment_df = pd.DataFrame(rows_list)
    return sentiment_df

def sentno_to_word(number):
    switcher = {
        0: 'negative',
        1: 'neutral',
        2: 'positive'
    }
    sentiment_string = (switcher.get(number))
    return sentiment_string

In [3]:
def is_stopword(word, stopword_list):
    if word in stopword_list:
        return True
    elif re.fullmatch(r'[a-zA-Z0-9]{1}[a-zA-Z\- ]*[a-zA-Z0-9]{1}', word) == None:
        return True
    else:
        return False

In [4]:
def extract_unigrams(df):
    df = df.copy()

    #words coming from negative and neutral sentiment sentences removal
    df.drop(df[ ~( df['sentiment'] == 'positive' ) ].index, inplace=True)

    #filtering based on xpos
    df.drop(df[ ~( (df['xpos'] == 'JJ') | (df['upos'] == 'NOUN') ) ].index, inplace=True) 

    #stopwords removal
    with open(Path("stopwords/unigrams_stopwords.txt"), 'r') as file:
        stopword_list = [line.rstrip('\n') for line in file]
    df['stop'] = df.word.map(lambda x: is_stopword(x, stopword_list))
    df.drop(df[(df['stop'] == True)].index, inplace=True)
    del df['stop']
    
    # Join unigrams into a list and return
    df = df.groupby('item_id', as_index=False, sort=False)[['word']].agg(lambda x: ','.join(x)) # group unigrams by item
    df.columns = ['item_id', 'unigrams']
    df['unigrams'] = df['unigrams'].str.lower()

    return df

In [5]:
def extract_bigrams(df):
    df = df.copy()
    # Filter bigrams using pos-tags
    df = pd.concat([df, df.shift(-1).add_prefix('next_')], axis=1) #create bigrams
    df.drop(df[df['review_id'] != df['next_review_id']].index, inplace=True) #delete rows with bigrams from different reviews
    df.drop(df[ (df['sentiment'] != 'positive') | (df['next_sentiment'] != 'positive') ].index, inplace=True)

    #filtering based on xpos
    df.drop(df[~( ( (df['xpos']  == 'JJ') & (df['next_upos']  == 'NOUN') ) | \
                  ( (df['upos']  == 'NN') & (df['next_upos']  == 'NOUN') ) )].index, inplace=True)
    
    #unigram stopword removal
    with open(Path("stopwords/bigrams_single_stopwords.txt"), 'r') as file:
      stopword_list = [line.rstrip('\n') for line in file]
    for stopword in stopword_list:
      df.drop(df[((df['word'] == stopword) | (df['next_word'] == stopword))].index, inplace=True)
    
    #Create bigrams
    df['bigrams'] = df['word'] + ' ' + df['next_word']

    #Remove bigrams stopwords
    with open(Path("stopwords/bigrams_stopwords.txt"), 'r') as file:
        stopword_list = [line.rstrip('\n') for line in file]
    df['stop'] = df['bigrams'].map(lambda x: is_stopword(x, stopword_list))
    df.drop(df[(df['stop'] == True)].index, inplace=True)
    del df['stop']

    #Aggregate bigrams by item
    df = df.groupby('item_id', as_index=False, sort=False)[['bigrams']].agg(lambda x: ','.join(x))
    df['bigrams'] = df['bigrams'].str.lower()

    return df

In [6]:
def tf(df, mode):
    '''
    Returns a Dataframe containing the term frequency for each token/bigram aggregated by item.

    Args:
        df: The DataFrame containing unigrams or bigrams, it must be the output of extract_unigrams or extract_bigrams.
        mode: String that specifies if working with unigrams or bigrams, it must be 'unigrams' or 'bigrams'.

    Returns:
        tf_concat: The Dataframe with the term frequency for each token/bigram.
        
    Raises:
        Exception: If mode not specified.
    '''
    
    df = df.copy()
    
    if not mode:
         raise Exception("You must set a mode, available modes are 'unigrams' and 'bigrams'") 

    tf_concat = pd.DataFrame()
    for row in df.itertuples():
        if mode == 'unigrams':
            wordlist = row.unigrams.split(',')
        else:
            wordlist = row.bigrams.split(',')
        dictionary = wordListToFreqDict(wordlist)
        sorted_dict = sortFreqDict(dictionary)
        tf_df = pd.DataFrame(sorted_dict, columns=['tf', 'term'])
        tf_df['item_id'] = str(row.item_id)
        tf_concat = pd.concat([tf_concat, tf_df])
    tf_concat.sort_values(by='item_id', inplace=True, ascending=False)
    tf_concat.reset_index(drop=True, inplace=True)
    return tf_concat

def wordListToFreqDict(wordlist):
    '''
    Returns a frequency dictionary for the wordlist.

    Args:
        wordlist: The list of terms.

    Returns:
        dict1: The frequency dictionary.
    '''
    
    doc_dim = len(wordlist)
    wordfreq = [wordlist.count(p)/doc_dim for p in wordlist]
    dict_ = dict(list(zip(wordlist,wordfreq)))
    return dict_

def sortFreqDict(freqdict):
    '''
    Returns the sorted frequency dictionary.

    Args:
        freqdict: The frequence dictionary to sort.

    Returns:
        aux: The sorted frequency dictionary.
    '''
    
    aux = [(freqdict[key], key) for key in freqdict]
    aux.sort()
    aux.reverse()
    return aux

In [7]:
def total_tfidf(item_ids, unigrams, bigrams, unigrams_tf, bigrams_tf):
  '''
  Calculates the tf-idf.

  Args:
      item_ids: list of item ids to consider during tf-idf calculation
      unigrams: dataframe with all the extracted unigrams, it can be obtained by joining together all partial dataframes
      bigrams: dataframe with all the extracted bigrams, it can be obtained by joining together all partial dataframes
      unigrams_tf: dataframe with the unigrams term frequency, it can be obtained by joining together all partial dataframes
      bigrams_tf: dataframe with the bigrams term frequency, it can be obtained by joining together all partial dataframes

  '''
  items_number = len(item_ids)

  all_items_path = 'all_items'
  if not os.path.exists(all_items_path):
    os.makedirs(all_items_path)

  # unigrams document frequency
  unigrams_df = df(unigrams, 'unigrams')
  path = os.path.join(all_items_path, 'unigrams_df.txt')
  unigrams_df.to_csv(path, sep="|", header=['term', 'df'], index=None)

  # unigrams tf-idf for each item
  unigrams_tfidf = tf_idf(unigrams_tf, unigrams_df, items_number) # calculate unigrams tfidf
  path = os.path.join(all_items_path, 'unigrams_tfidf.txt')
  unigrams_tfidf.to_csv(path, sep="|", header=['term', 'item_id', 'tf_idf'], index=None)

  # Bigrams document frequency
  bigrams_df = df(bigrams, 'bigrams')
  path = os.path.join(all_items_path, 'bigrams_df.txt')
  bigrams_df.to_csv(path, sep="|", header=['term', 'df'], index=None)
    
  # Bigrams tf-idf for each item
  bigrams_tfidf = tf_idf(bigrams_tf, bigrams_df, items_number)
  path = os.path.join(all_items_path, 'bigrams_tfidf.txt')
  bigrams_tfidf.to_csv(path, sep="|", header=['term', 'item_id', 'tf_idf'], index=None)
  
  return

def df(df, mode):
    '''
    Returns a Dataframe containing the document frequency for each token/bigram.

    Args:
        df: The DataFrame containing unigrams or bigrams, it must be the output of extract_unigrams or extract_bigrams.
        mode: String that specifies if working with unigrams or bigrams, it must be 'unigrams' or 'bigrams'.

    Returns:
        item_df: The Dataframe with the document frequency for each token/bigram.
        
    Raises:
        Exception: If mode not specified.
    '''
    
    if not mode:
         raise Exception("You must set a mode, available modes are 'unigrams' and 'bigrams'") 
    
    df = df.copy()

    df_dict = {}
    for row in df.itertuples():
        if mode == 'unigrams':
            items = row.unigrams.split(',')
        else:
            items = row.bigrams.split(',')
        for w in items:
            try:
                df_dict[w].add(row.item_id)
            except:
                df_dict[w] = {row.item_id}
    for i in df_dict:
        df_dict[i] = len(df_dict[i])
    item_df = pd.DataFrame.from_dict(df_dict, orient='index').reset_index()
    item_df.columns = np.arange(len(item_df.columns))
    item_df.columns = ['term', 'df']
    item_df.sort_values(by='df', inplace=True, ascending=False)
    item_df = item_df.reset_index()
    del item_df['index']
    return item_df

def tf_idf(tf, df, doc_number):
    '''
    Returns a Dataframe containing the tf-idf for each token/bigram.

    Args:
        tf: The DataFrame containing term frequency, it must be the output of tf.
        df: The DataFrame containing document frequency, it must be the output of df.
        doc_number: The number of documents for idf calculation, it is equal to the number of items analyzed

    Returns:
        tf_idf_df: The Dataframe with the tf-idf score for each token/bigram.
    '''
    
    tf = tf.copy()
    tf['tf'] = tf['tf'].astype(float)
    tf['term'] = tf['term'].astype(str)
    
    df = df.copy()
    df['df'] = df['df'].astype(int)
    df['term'] = df['term'].astype(str)
    
    tf_idf_df = tf.merge(df, on="term")
    tf_idf_df['tf_idf'] = tf_idf_df['tf'] * np.log10(doc_number/tf_idf_df['df'])
    del tf_idf_df['tf']
    del tf_idf_df['df']
    tf_idf_df.sort_values(by='tf_idf', inplace=True, ascending=False)
    return tf_idf_df

# Elaborazione dataset

In [8]:
#amazon_items_reviews = pd.read_csv('dataset.csv')
#annotated_df = lexical_analysis(amazon_items_reviews)

# Estrazione aspetti

In [9]:
# Read the annotated dataset
annotated_df = pd.read_csv('annotated_dataset_big.csv')

In [10]:
item_ids = annotated_df['item_id'].unique()

all_items_path = 'all_items'
if not os.path.exists(all_items_path):
    os.makedirs(all_items_path)
    
unigrams = 'unigrams'
if not os.path.exists('unigrams'):
    os.makedirs('unigrams')

bigrams = 'bigrams'
if not os.path.exists('bigrams'):
    os.makedirs('bigrams')

# Unigrams extraction
unigrams = extract_unigrams(annotated_df)
unigrams.to_csv(os.path.join(all_items_path, 'unigrams' + '.txt'), sep="|", header=['item_id', 'unigrams'], index=False)
unigrams_tf = tf(unigrams, 'unigrams')
unigrams_tf.to_csv(os.path.join(all_items_path, 'unigrams_tf' + '.txt'), sep="|", header=['tf', 'term', 'item_id'], index=False)

# Bigrams extraction
bigrams = extract_bigrams(annotated_df)
bigrams.to_csv(os.path.join(all_items_path, 'bigrams' + '.txt'), sep="|", header=['item_id', 'bigrams'], index=False)
bigrams_tf = tf(bigrams, 'bigrams')
bigrams_tf.to_csv(os.path.join(all_items_path, 'bigrams_tf' + '.txt'), sep="|", header=['tf', 'term', 'item_id'], index=False)

# tf-idf
total_tfidf(item_ids, unigrams, bigrams, unigrams_tf, bigrams_tf)

In [11]:
unigrams

Unnamed: 0,item_id,unigrams
0,Q7305024,"combination,story,era,vivid,rich,bitter,comple..."
1,Q5460091,"course,attractive,hostage,man,admiration,help,..."
2,Q2609885,"young,thief,endearing,engaging,level,nice,spel..."
3,Q1764445,"previous,series,informative,interesting,story,..."
4,Q731626,"thougth,provoking,phenominal,tact,wit,heart,mi..."
...,...,...
1484,Q7970705,"gift,friend"
1485,Q3882176,"glad,collection"
1486,Q29384930,"excellent,overview,major,change,sound,picture,..."
1487,Q7225574,"edition,unforgettable"


In [12]:
# Generation of aspects file for each book
unigrams_tfidf = pd.read_csv(os.path.join(all_items_path, 'unigrams_tfidf.txt'), sep='|')
bigrams_tfidf = pd.read_csv(os.path.join(all_items_path, 'bigrams_tfidf.txt'), sep='|')

for id in item_ids:
    item_tfidf = unigrams_tfidf[unigrams_tfidf['item_id'] == str(id)]
    item_tfidf.to_csv(os.path.join('unigrams', str(id) + '.txt'), sep='|', columns=['term', 'tf_idf'], index=None)

    item_tfidf = bigrams_tfidf[bigrams_tfidf['item_id'] == str(id)]
    item_tfidf.to_csv(os.path.join('bigrams', str(id) + '.txt'), sep='|', columns=['term', 'tf_idf'], index=None)

# Selezione aspetti

In [13]:
K = 50

In [14]:
# Select and save unigrams topK
unigrams_path = 'unigrams/'
unigrams_topK_path = 'unigrams_top'+ str(K) + '/'
if not os.path.exists(unigrams_topK_path):
    os.makedirs(unigrams_topK_path)
unigrams_list = os.listdir(unigrams_path)

for book_aspects in unigrams_list:
    df = pd.read_csv(unigrams_path+book_aspects, sep='|')
    top50_df = df.head(K)
    top50_df.to_csv( (unigrams_topK_path + book_aspects), sep='|', index = None)


# Select and save bigrams topK
bigrams_path = 'bigrams/'
bigrams_topK_path = 'bigrams_top'+ str(K) + '/'
if not os.path.exists(bigrams_topK_path):
    os.makedirs(bigrams_topK_path)
bigrams_list = os.listdir(bigrams_path)

for book_aspects in bigrams_list:
    df = pd.read_csv(bigrams_path+book_aspects, sep='|')
    top50_df = df.head(K)
    top50_df.to_csv( (bigrams_topK_path + book_aspects), sep='|', index = None)

# Modifica bigrammi positivi

In [15]:
bigrams_path = 'bigrams_top'+ str(K) + '/'
bigrams_list = os.listdir(bigrams_path)
rows_list = []

with open('stopwords/positive_adjectives.txt', 'r') as file:
    temp = file.read().splitlines()
    pos_adjs = [adj for adj in temp]

for book_aspects in bigrams_list:
    df = pd.read_csv(bigrams_path+book_aspects, sep='|')
    for row in df.itertuples():
        for adj in pos_adjs:
            if row.term.startswith(adj):
                unigram = row.term.replace(adj+' ','')             
                df.drop(df[df['term'] == row.term].index, inplace=True)
                unigram_row = {}
                unigram_row.update({'item_id': book_aspects.replace('.txt', '')})
                unigram_row.update({'term': unigram})
                unigram_row.update({'tf_idf': row.tf_idf})
                rows_list.append(unigram_row)
    df.to_csv((bigrams_path + book_aspects), sep='|', index = None)

additional_unigrams = pd.DataFrame(rows_list)


In [16]:
#stopwords removal
with open(Path("stopwords/unigrams_stopwords.txt"), 'r') as file:
    stopword_list = [line.rstrip('\n') for line in file]
additional_unigrams['stop'] = additional_unigrams.term.map(lambda x: is_stopword(x, stopword_list))
additional_unigrams.drop(additional_unigrams[(additional_unigrams['stop'] == True)].index, inplace=True)
del additional_unigrams['stop']

unigrams_path = 'unigrams_top'+ str(K) + '/'

for row in additional_unigrams.itertuples():
    item_id = row.item_id
    unigram = row.term
    item_unigrams = pd.read_csv(unigrams_path + item_id + '.txt', sep="|")
    u = item_unigrams['term'].tolist()
    if unigram not in u:
        with open(unigrams_path + item_id + '.txt', 'a') as file:
            file.write(f'{unigram}|{row.tf_idf}\r')
            file.close()

In [17]:
additional_unigrams

Unnamed: 0,item_id,term,tf_idf
0,Q10263869,trance,0.021193
1,Q10263869,snapshot,0.021193
4,Q1027487,example,0.010438
5,Q1027487,layla,0.009405
6,Q1027487,beautifull imagination,0.009405
...,...,...,...
7009,Q964269,anti-hero,0.027643
7010,Q978135,kerouac,0.127159
7011,Q978135,creator,0.115118
7012,Q978135,musician,0.115118


# Creazione file popolazione KB

In [18]:
output_path = 'output/'

In [19]:
annotated_df = pd.read_csv('annotated_dataset_big.csv')
item_ids = annotated_df['item_id'].unique()
u_topK_path = 'unigrams_top'+ str(K) + '/'
b_topK_path = 'bigrams_top'+ str(K) + '/'

In [20]:
all_items_unigrams = []
no_unigrams_item_ids = []

for id in item_ids:
    try:
        with open(os.path.join(u_topK_path, id + '.txt'), 'r') as file:
            item_unigrams = [re.sub(r'[|]{1}.*?[\n]{1}','', aspect) for aspect in file.readlines()[1:]]
            all_items_unigrams.extend([id, unigram] for unigram in item_unigrams)
    except FileNotFoundError:
        no_unigrams_item_ids.append(id)

unigrams_df = pd.DataFrame(all_items_unigrams, columns=['item_id', 'aspect'])
unigrams_ids = unigrams_df.drop(labels='item_id', axis=1)
unigrams_ids.drop_duplicates(inplace=True)
unigrams_ids.reset_index(drop=True, inplace=True)
unigrams_ids.insert(0, 'id', [id for id in range(1, len(unigrams_ids) + 1, 1)])

In [21]:
all_items_bigrams = []
no_bigrams_item_ids = []

for id in item_ids:
    try:
        with open(os.path.join(b_topK_path, id + '.txt'), 'r') as file:
            item_bigrams = [re.sub(r'[|]{1}.*?[\n]{1}','', aspect) for aspect in file.readlines()[1:]]
            all_items_bigrams.extend([id, bigram] for bigram in item_bigrams)
    except FileNotFoundError:
        no_bigrams_item_ids.append(id)

bigrams_df = pd.DataFrame(all_items_bigrams, columns=['item_id', 'aspect'])
bigrams_ids = bigrams_df.drop(labels='item_id', axis=1)
bigrams_ids.drop_duplicates(inplace=True)
bigrams_ids.reset_index(drop=True, inplace=True)
bigrams_ids.insert(0, 'id', [id for id in range(1_000_000, len(bigrams_ids) + 1_000_000, 1)])

In [22]:
aspects_df = pd.concat([unigrams_df, bigrams_df])
aspects_ids = pd.concat([unigrams_ids, bigrams_ids])
aspects_mapping = aspects_df.merge(aspects_ids, on='aspect')
aspects_mapping.drop(labels='aspect', axis=1, inplace=True)
aspects_mapping['property_type'] = 'review'
aspects_mapping.reindex(columns=['item_id', 'property_type', 'id']).to_csv(os.path.join(all_items_path, 'aspects_mapping.txt'), sep='|', index=False, header=False)
aspects_ids.to_csv(os.path.join(all_items_path, 'aspects_list.txt'), sep='|', index=False, header=False)

# Taglio aspetti poco connessi

In [23]:
def aspect_drop(row, threshold, counter):
    return True if counter[row['property_id']] < threshold else False

In [24]:
# Elimina gli aspetti con un numero di connessioni inferiore a threshold
THRESHOLD = 10

In [25]:
subj_props = pd.read_csv(os.path.join(all_items_path, 'aspects_mapping.txt'), header=None, sep='|', names=['entity_id', 'property_type', 'property_id'])
books_per_aspect = Counter(subj_props['property_id'])

subj_props['drop'] = subj_props.apply(lambda row: aspect_drop(row, THRESHOLD, books_per_aspect), axis=1)
subj_props.drop(subj_props[subj_props['drop'] == True].index, inplace=True)

books_per_aspect = Counter(subj_props['property_id'])
aspects_per_book = Counter(subj_props['entity_id'])

In [26]:
subj_props.drop(columns=['drop'], inplace=True)
aspects_ids = pd.read_csv(os.path.join(all_items_path, 'aspects_list.txt'), sep='|', names=['property_id', 'property'])
aspects_ids = aspects_ids.merge(subj_props, on='property_id', how='inner')
aspects_ids.drop(columns=['entity_id', 'property_type'], inplace=True)
aspects_ids.drop_duplicates(inplace=True)

In [27]:
subj_props.to_csv(os.path.join(output_path, 'aspects_mapping.txt'), sep='|', index=False, header=False)
aspects_ids.to_csv(os.path.join(output_path, 'aspects_list.txt'), sep='|', index=False, header=False)

# Statistiche aspetti estratti

In [28]:
books_per_aspect = Counter(subj_props['property_id'])
aspects_per_book = Counter(subj_props['entity_id'])

In [29]:
print('count is the number of aspects, statistics are books per aspect:')
pd.Series(books_per_aspect).describe()

count is the number of aspects, statistics are books per aspect:


count    1455.000000
mean       24.155326
std        29.537085
min        10.000000
25%        12.000000
50%        16.000000
75%        25.500000
max       618.000000
dtype: float64

In [30]:
print('count is the number of books, statistics are aspects per book:')
pd.Series(aspects_per_book).describe()

count is the number of books, statistics are aspects per book:


count    1485.000000
mean       23.667340
std        10.955917
min         1.000000
25%        16.000000
50%        23.000000
75%        31.000000
max        62.000000
dtype: float64

# Creazione file Entity Recognizer

## Wikidata + Aspetti

In [31]:
er_path = 'entity_recognizer/'

In [32]:
o_fuzzy = pd.read_csv(os.path.join(er_path, 'entitiesfuzzy.train'), header=None, names=['property_id', 'property', 'threshold', 'item'])
o_regex = pd.read_csv(os.path.join(er_path, 'entitiesregex.train'), sep='\t', header=None, names=['property', 'item'])
o_gazette = pd.read_csv(os.path.join(er_path, 'gazette.txt'), sep='\t', header=None, names=['item', 'property'])

In [33]:
fuzzy = aspects_ids.copy()
fuzzy['threshold'] = 80
fuzzy['item'] = 'item'
fuzzy = pd.concat([o_fuzzy, fuzzy])
fuzzy.to_csv(os.path.join(output_path, 'entitiesfuzzy.train'), sep=',', header=False, index=False)

In [34]:
regex = aspects_ids.copy()
regex.drop('property_id', axis=1, inplace=True)
regex['item'] = 'item'
regex = pd.concat([o_regex, regex])
regex.to_csv(os.path.join(output_path, 'entitiesregex.train'), header=False, index=False, sep='\t')

In [35]:
gazette = aspects_ids.copy()
gazette.drop('property_id', axis=1, inplace=True)
gazette['item'] = 'item'
gazette = gazette.reindex(columns=['item', 'property'])
gazette = pd.concat([o_gazette, gazette])
gazette.to_csv(os.path.join(output_path, 'gazette.txt'), sep='\t', header=False, index=False)

## Solo aspetti

In [36]:
a_fuzzy = pd.read_csv(os.path.join(er_path, 'entitiesfuzzy_aspects_only.train'), header=None, names=['property_id', 'property', 'threshold', 'item'])
a_regex = pd.read_csv(os.path.join(er_path, 'entitiesregex_aspects_only.train'), sep='\t', header=None, names=['property', 'item'])
a_gazette = pd.read_csv(os.path.join(er_path, 'gazette_aspects_only.txt'), sep='\t', header=None, names=['item', 'property'])

In [37]:
fuzzy = aspects_ids.copy()
fuzzy['threshold'] = 80
fuzzy['item'] = 'item'
fuzzy = pd.concat([a_fuzzy, fuzzy])
fuzzy.to_csv(os.path.join(output_path, 'entitiesfuzzy_aspects_only.train'), sep=',', header=False, index=False)

In [38]:
regex = aspects_ids.copy()
regex.drop('property_id', axis=1, inplace=True)
regex['item'] = 'item'
regex = pd.concat([a_regex, regex])
regex.to_csv(os.path.join(output_path, 'entitiesregex_aspects_only.train'), header=False, index=False, sep='\t')

In [39]:
gazette = aspects_ids.copy()
gazette.drop('property_id', axis=1, inplace=True)
gazette['item'] = 'item'
gazette = gazette.reindex(columns=['item', 'property'])
gazette = pd.concat([a_gazette, gazette])
gazette.to_csv(os.path.join(output_path, 'gazette_aspects_only.txt'), sep='\t', header=False, index=False)

# Creazione Dataset per Algoritmi Content-Based

In [40]:
cb_path = 'content_based/'

In [41]:
books_dataset = pd.read_csv(os.path.join(cb_path, 'books_info.csv'))
aspects_dataset = pd.read_csv(os.path.join(output_path, 'aspects_mapping.txt'), sep='|', names=['ID', 'type', 'asp'])

rows_list = []
for row in books_dataset.itertuples():
    new_row = {}
    book_asps = aspects_dataset.drop(aspects_dataset[aspects_dataset['ID'] != str(row.ID)].index)
    book_asps = book_asps['asp']
    book_asps = book_asps.to_list()
    str_book_asps = [str(aspect) for aspect in book_asps]
    new_row.update({'ID' : row.ID})
    new_row.update({'Title' : row.Title})
    new_row.update({'Description' : row.Description})
    new_row.update({'Subjects' : row.Subjects})
    new_row.update({'Authors' : row.Authors})
    new_row.update({'Genres' : row.Genres})
    new_row.update({'Aspects' : str_book_asps})
    rows_list.append(new_row)

df = pd.DataFrame(rows_list)
df.to_csv(os.path.join(output_path, 'books_info.csv'), index=False)