# Setup

In [None]:
# Execution environment selector: 'local' when using a local runtime,
# 'hosted' when using the Colab runtime. The cells below branch on this value.
runtime_type = 'hosted'
# Set to True to download the required resources. This is needed every time
# on a hosted runtime and only once when running locally.
download_resources = True

## Connessione a Google Drive

In [None]:
# Google Drive is mounted only when running on the hosted (Colab) runtime;
# google.colab is imported inside the branch so that it is not required locally.
if runtime_type == 'hosted':
  from google.colab import drive 
  drive.mount('/content/drive') # mount Google Drive under /content/drive

## Selezione directory di lavoro

In [None]:
# NOTE: %cd is an IPython magic, so this cell only runs inside a notebook/IPython session.
# The relative paths used by the functions below (e.g. stopwords/, all_movies/)
# are resolved against the directory selected here.

# Google Drive working directory path (hosted runtime)
if runtime_type == 'hosted':
  %cd /content/drive/My\ Drive/Colab\ Files/ 

# local working directory path (local runtime)
if runtime_type == 'local':
  %cd D:\opimi_test\ 

## Download risorse

In [None]:
# Install stanza, then download the Stanza 'english' resources and the NLTK
# 'wordnet' and 'sentiwordnet' corpora. Skipped when download_resources is False.
# NOTE: "!pip" is an IPython shell escape, so this cell only runs inside a notebook/IPython session.
if download_resources:
  !pip install stanza
  import stanza
  import nltk
  stanza.download('english')
  nltk.download('wordnet')
  nltk.download('sentiwordnet')

# Funzioni

## Pre-processing

In [None]:
import pandas as pd
import stanza
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn

def sentiment_analysis(df):
    '''
    Splits every review into tokens and tags each token with the sentiment detected for its source sentence.

    Args:
        df: The DataFrame on which to perform the sentiment analysis, it must have movie_id, review_id and review_text columns.

    Returns:
        sentiment_df: A DataFrame with one row per token, carrying movie_id, review_id, token and the sentiment
        label ('negative', 'neutral' or 'positive') of the sentence the token belongs to.
    '''

    reviews = df.copy()
    for column in ('movie_id', 'review_id', 'review_text'):
        reviews[column] = reviews[column].astype(str)

    pipeline = stanza.Pipeline(lang='en', processors='tokenize,sentiment')
    records = []
    for review in reviews.itertuples():
        parsed = pipeline(str(review.review_text))
        for sentence in parsed.sentences:
            # the sentiment is computed per sentence and shared by all its tokens
            label = sentiment_to_word(int(sentence.sentiment))
            records.extend(
                {
                    'movie_id': str(review.movie_id),
                    'review_id': str(review.review_id),
                    'token': str(token.text),
                    'sentiment': label,
                }
                for token in sentence.tokens
            )
    return pd.DataFrame(records)

def sentiment_to_word(number):
    '''
    Converts the numerical representation of sentiment into a string.

    Args:
        number: Sentiment represented as a number (0, 1 or 2).

    Returns:
        'negative' for 0, 'neutral' for 1, 'positive' for 2 and
        'Invalid number specified' for any other value.
    '''

    labels = dict(enumerate(('negative', 'neutral', 'positive')))
    try:
        return labels[number]
    except KeyError:
        return 'Invalid number specified'

def pos_lemma(df):
    '''
    Re-tags the tokens of every review with their lemma, universal POS (UPOS) tag, treebank-specific POS (XPOS) tag
    and universal morphological features (UFeats).

    Args:
        df: The DataFrame with the tokens to POS-tag, it must be the output of sentiment_analysis.

    Returns:
        pos_lemma_df: A DataFrame with one row per word and the movie_id, review_id, token, upos, xpos and feats
        values. The token value holds the lemma of the word; feats is "_" when the word has no features.
    '''

    tokens_df = df.copy()
    for column in ('movie_id', 'review_id', 'token'):
        tokens_df[column] = tokens_df[column].astype(str)

    # rebuild one space-separated string of tokens for each (movie, review) pair
    grouped = tokens_df.groupby(['movie_id', 'review_id'], as_index=False, sort=False)
    reviews = grouped[['token']].agg(lambda x: ' '.join(x))

    # the text is already tokenized, so the pipeline must not split it again
    pipeline = stanza.Pipeline(lang='en', processors='tokenize,pos,lemma', tokenize_pretokenized=True)
    records = []
    for review in reviews.itertuples():
        for sentence in pipeline(review.token).sentences:
            for word in sentence.words:
                records.append({
                    'movie_id': str(review.movie_id),
                    'review_id': str(review.review_id),
                    'token': (str(word.lemma)),
                    'upos': str(word.upos),
                    'xpos': str(word.xpos),
                    'feats': str(word.feats if word.feats else "_"),
                })
    return pd.DataFrame(records)

def calculate_swn_score(df):
    '''
    Tags each token with its positivity, negativity and objectivity score from SentiWordNet.

    Args:
        df: The dataframe with the tokens, it must be the output of pos_lemma.

    Returns:
        A copy of the dataframe with the positivity, negativity and objectivity columns added.
    '''

    scored = df.copy()
    scored['token'] = scored['token'].astype(str)
    scored['xpos'] = scored['xpos'].astype(str)

    scorers = (
        ('positivity', get_positivity_score),
        ('negativity', get_negativity_score),
        ('objectivity', get_objectivity_score),
    )
    for column, scorer in scorers:
        # bind scorer as a default so each lambda keeps its own function
        scored[column] = scored.apply(lambda row, scorer=scorer: scorer(row.token, row.xpos), axis=1)
    return scored

def get_positivity_score(word, xpos_tag):
    '''
    Returns the SentiWordNet positivity score for the given word and POS tag.

    The score is read from the first sense ('<word>.<pos>.01') of the word.

    Args:
        word: The original word.
        xpos_tag: The treebank-specific POS tag of the word.

    Returns:
        The positivity score of the given word.
        0 if the given treebank-specific POS tag can't be mapped to a wordnet POS tag or if can't get a value for the given word.
    '''

    pos_tag = xpos_to_wnpos(xpos_tag)
    if pos_tag is None:
        return 0
    synset_name = str(word) + '.' + str(pos_tag) + '.' + '01'
    try:
        return swn.senti_synset(synset_name).pos_score()
    except Exception:
        # Best effort: an unknown lemma yields score 0. Exception (not a bare except)
        # lets KeyboardInterrupt and SystemExit propagate.
        return 0
    
def get_negativity_score(word, xpos_tag):
    '''
    Returns the SentiWordNet negativity score for the given word and POS tag.

    The score is read from the first sense ('<word>.<pos>.01') of the word.

    Args:
        word: The original word.
        xpos_tag: The treebank-specific POS tag of the word.

    Returns:
        The negativity score of the given word.
        0 if the given treebank-specific POS tag can't be mapped to a wordnet POS tag or if can't get a value for the given word.
    '''

    pos_tag = xpos_to_wnpos(xpos_tag)
    if pos_tag is None:
        return 0
    synset_name = str(word) + '.' + str(pos_tag) + '.' + '01'
    try:
        return swn.senti_synset(synset_name).neg_score()
    except Exception:
        # Best effort: an unknown lemma yields score 0. Exception (not a bare except)
        # lets KeyboardInterrupt and SystemExit propagate.
        return 0
    
def get_objectivity_score(word, xpos_tag):
    '''
    Returns the SentiWordNet objectivity score for the given word and POS tag.

    The score is read from the first sense ('<word>.<pos>.01') of the word.

    Args:
        word: The original word.
        xpos_tag: The treebank-specific POS tag of the word.

    Returns:
        The objectivity score of the given word.
        0 if the given treebank-specific POS tag can't be mapped to a wordnet POS tag or if can't get a value for the given word.
    '''

    pos_tag = xpos_to_wnpos(xpos_tag)
    if pos_tag is None:
        return 0
    synset_name = str(word) + '.' + str(pos_tag) + '.' + '01'
    try:
        return swn.senti_synset(synset_name).obj_score()
    except Exception:
        # Best effort: an unknown lemma yields score 0. Exception (not a bare except)
        # lets KeyboardInterrupt and SystemExit propagate.
        return 0
        
def xpos_to_wnpos(xpos_tag):
    '''
    Returns wordnet POS tag for the given treebank-specific POS tag or None if can't convert it.

    Only the first letter of the tag is inspected: J maps to adjective, N to noun,
    V to verb and R to adverb.

    Args:
        xpos_tag: The xpos tag.

    Returns:
        The matching wordnet POS constant, or None when the tag is empty/None
        or its first letter is not one of J, N, V, R.
    '''

    # an empty (or missing) tag has no first letter to look at
    if not xpos_tag:
        return None

    tag_dict = {"J": wn.ADJ,
                "N": wn.NOUN,
                "V": wn.VERB,
                "R": wn.ADV}

    return tag_dict.get(xpos_tag[0].upper(), None)

def filter_by_sentiment(df, sentiment):
    '''
    Returns the given DataFrame after removing tokens not matching the given sentiment.

    Args:
        df: The original dataframe with tokens associated to the sentiment of the source sentence.
        sentiment: The desired sentiment to keep.

    Returns:
        df: A new DataFrame containing only tokens associated to the given sentiment, without the
        sentiment column. The surviving rows keep their original index labels.
    '''

    df = df.copy()
    df['sentiment'] = df['sentiment'].astype(str)
    # Select rows positionally with a boolean mask: dropping by index label would also
    # remove matching rows that share a label with a non-matching one.
    matching = df['sentiment'] == sentiment
    return df.loc[matching].drop(columns='sentiment')

def lowercase(df):
    '''
    Returns a copy of the given DataFrame with all tokens lowercased.

    Args:
        df: The original dataframe, it must have a token column.

    Returns:
        A new DataFrame whose token column holds the lowercased tokens; the input is left untouched.
    '''

    lowered = df.copy()
    lowered['token'] = lowered['token'].astype(str).map(str.lower)
    return lowered

## Aspect Extraction

In [None]:
import re
from pathlib import Path
import pandas as pd
import os
import gc

def extract_tokens(df):
    '''
    Returns a DataFrame with the filtered tokens of each movie.

    Only adjectives (xpos JJ) and nouns (upos NOUN) are kept; punctuation, proper nouns and comparative or
    superlative adjectives are discarded. The remaining tokens are cleaned with alphabetize_tokens, then tokens
    left empty, tokens whose three SentiWordNet scores sum to zero and stopwords are removed.

    Args:
        df: The DataFrame with the pos-tagged tokens to filter, it must have movie_id, review_id, token, upos,
        xpos, feats, positivity, negativity and objectivity columns.

    Returns:
        A DataFrame with a movie_id column and a tokens column holding the list of tokens kept for that movie.
    '''

    tagged = df.copy()
    for column in ('movie_id', 'review_id', 'token', 'upos', 'xpos', 'feats'):
        tagged[column] = tagged[column].astype(str)
    for column in ('positivity', 'negativity', 'objectivity'):
        tagged[column] = tagged[column].astype(float)

    # Filter tokens using pos-tags
    is_adjective = tagged.xpos == 'JJ'
    is_noun = tagged.upos == 'NOUN'
    unwanted = (
        (tagged.upos == 'PUNCT')        # punctuation
        | (tagged.upos == 'PROPN')      # proper nouns
        | (tagged.xpos == 'JJR')        # comparative adjectives
        | (tagged.xpos == 'JJS')        # superlative adjectives
        | ~(is_adjective | is_noun)     # anything that is not an adjective or a noun
    )
    tagged.drop(tagged[unwanted].index, inplace=True)
    tagged.reset_index(drop=True, inplace=True)

    # Clean the tokens and drop the ones left empty by the cleaning
    tagged = alphabetize_tokens(tagged)
    tagged.drop(tagged[tagged.token == ''].index, inplace=True)
    tagged.reset_index(drop=True, inplace=True)

    # Drop tokens without any SentiWordNet score
    no_score = (tagged.objectivity + tagged.positivity + tagged.negativity) == 0.0
    tagged.drop(tagged[no_score].index, inplace=True)
    tagged.reset_index(drop=True, inplace=True)

    tagged = remove_stopwords(tagged, 'token')

    # Group the tokens by movie and turn each group into a list
    per_movie = tagged.groupby('movie_id', as_index=False, sort=False)[['token']].agg(lambda x: ' '.join(x))
    records = [
        {'movie_id': str(movie.movie_id), 'tokens': movie.token.split(' ')}
        for movie in per_movie.itertuples()
    ]
    return pd.DataFrame(records)

def save_tokens(df):
    '''
    Writes the extracted tokens of every movie to <movie_id>/tokens/extracted_tokens.txt, one token per line.

    Args:
        df: The DataFrame with tokens, it must be the output of extract_tokens.
    '''

    movies = df.copy()
    movies['movie_id'] = movies['movie_id'].astype(str)

    for movie in movies.itertuples():
        folder = os.path.join(movie.movie_id, 'tokens')
        if not os.path.exists(folder):
            os.makedirs(folder)
        target = os.path.join(folder, 'extracted_tokens.txt')
        with open(target, 'w', encoding='UTF8') as tokens_file:
            tokens_file.writelines(token + '\n' for token in movie.tokens)

def extract_bigrams(df):
    '''
    Returns a DataFrame with the filtered bigrams of each movie.

    A bigram is a pair of consecutive tokens from the same review. Bigrams containing punctuation, comparative or
    superlative adjectives, proper nouns or empty tokens are removed. Of the remaining bigrams only those matching
    these pos-tags are kept: adjective-noun and noun-noun. Bigrams where either token has all three SentiWordNet
    scores equal to zero, bigrams containing a word listed in stopwords/bigrams_single_stopwords.txt and bigrams
    rejected by remove_stopwords are removed as well.

    Args:
        df: The DataFrame from which to extract bigrams, it must have movie_id, review_id, token, upos, xpos,
        feats, positivity, negativity and objectivity columns.

    Returns:
        bigrams_df: The Dataframe with a movie_id column and a bigrams column holding the bigrams of the movie
        joined by commas.
    '''

    df = df.copy()
    for column in ('movie_id', 'review_id', 'token', 'upos', 'xpos', 'feats'):
        df[column] = df[column].astype(str)
    for column in ('positivity', 'negativity', 'objectivity'):
        df[column] = df[column].astype(float)

    # delete unused columns to save memory
    del df['feats']

    # Pair every token with the one that follows it (columns prefixed with next_)
    bigrams_df = pd.concat([df, df.shift(-1).add_prefix('next_')], axis=1)
    bigrams_df.drop(bigrams_df[bigrams_df.review_id != bigrams_df.next_review_id].index, inplace=True) # delete rows with bigrams from different reviews
    del bigrams_df['review_id'] # delete to save memory
    del bigrams_df['next_review_id'] # delete to save memory

    # Filter bigrams using pos-tags
    bigrams_df.drop(bigrams_df[((bigrams_df.upos == 'PUNCT') | (bigrams_df.next_upos == 'PUNCT'))].index, inplace=True) # delete rows with bigrams containing punctuation
    bigrams_df.drop(bigrams_df[(bigrams_df.xpos == 'JJR') | (bigrams_df.next_xpos == 'JJR')].index, inplace=True) # delete rows with bigrams containing comparative adjectives
    bigrams_df.drop(bigrams_df[(bigrams_df.xpos == 'JJS') | (bigrams_df.next_xpos == 'JJS')].index, inplace=True) # delete rows with bigrams containing superlative adjectives
    bigrams_df.drop(bigrams_df[(bigrams_df.upos == 'PROPN') | (bigrams_df.next_upos == 'PROPN')].index, inplace=True) # delete rows with bigrams containing proper nouns
    bigrams_df.drop(bigrams_df[(bigrams_df.token == '') | (bigrams_df.next_token == '')].index, inplace=True) # delete rows with bigrams where one token is empty
    interesting = (
        ((bigrams_df.xpos == 'JJ') & (bigrams_df.next_upos == 'NOUN'))
        | ((bigrams_df.upos == 'NOUN') & (bigrams_df.next_upos == 'NOUN'))
    )
    bigrams_df.drop(bigrams_df[~interesting].index, inplace=True) # delete rows with bigrams not matching interesting couples
    no_score = (
        (bigrams_df.positivity + bigrams_df.negativity + bigrams_df.objectivity == 0)
        | (bigrams_df.next_positivity + bigrams_df.next_negativity + bigrams_df.next_objectivity == 0)
    )
    bigrams_df.drop(bigrams_df[no_score].index, inplace=True) # delete rows where one token has no SentiWordNet score

    # This file lists the words that discard a bigram when they appear as its first or last word.
    stopwords_folder = Path("stopwords/")
    file_path = stopwords_folder / "bigrams_single_stopwords.txt"
    with open(file_path, 'r') as file:
        single_stopwords = {line.rstrip('\n') for line in file}
    # One vectorized membership test instead of one full-frame scan per stopword
    has_stopword = bigrams_df.token.isin(single_stopwords) | bigrams_df.next_token.isin(single_stopwords)
    bigrams_df.drop(bigrams_df[has_stopword].index, inplace=True)

    # Create bigrams
    bigrams_df['bigrams'] = bigrams_df['token'] + ' ' + bigrams_df['next_token']

    # Remove stopwords (remove_stopwords treats every mode other than 'token' as bigrams)
    bigrams_df = remove_stopwords(bigrams_df, 'bigrams')

    # Aggregate bigrams by movie
    bigrams_df = bigrams_df.groupby('movie_id', as_index=False, sort=False)[['bigrams']].agg(lambda x: ','.join(x))

    return bigrams_df

def save_bigrams(df):
    '''
    Writes the extracted bigrams of every movie to <movie_id>/bigrams/extracted_bigrams.txt, one bigram per line.

    Args:
        df: The DataFrame with bigrams, it must be extract_bigrams output (bigrams joined by commas).
    '''

    movies = df.copy()
    movies['movie_id'] = movies['movie_id'].astype(str)

    for movie in movies.itertuples():
        folder = os.path.join(movie.movie_id, 'bigrams')
        if not os.path.exists(folder):
            os.makedirs(folder)
        target = os.path.join(folder, 'extracted_bigrams.txt')
        with open(target, 'w', encoding='UTF8') as bigrams_file:
            bigrams_file.writelines(bigram + '\n' for bigram in movie.bigrams.split(','))

def alphabetize_tokens(df):
    '''
    Returns a copy of the given dataframe with every token cleaned by alphabetize_string.

    Args:
        df: The original dataframe, it must have a token column.

    Returns:
        A new DataFrame whose token column holds the cleaned tokens; the input is left untouched.
    '''

    cleaned = df.copy()
    cleaned['token'] = cleaned['token'].astype(str).map(alphabetize_string)
    return cleaned

def alphabetize_string(string):
    '''
    Returns the given string with stray hyphens removed.

    Spans made of word characters, one hyphen and word characters (e.g. "well-known") are kept as they are;
    every other run of hyphens is deleted. No other character is touched.

    Args:
        string: The original string.

    Returns:
        The string without the stray hyphens.
    '''

    pattern = r"(\w+-\w+)|-+"

    def keep_hyphenated(match):
        # group 1 is only set when the hyphen sits between word characters
        return match.group(1) or ''

    return re.sub(pattern, keep_hyphenated, string)

def remove_stopwords(df, mode):
    '''
    Returns the given df after removing the rows whose token/bigram is a stopword.

    Stopwords are read, one per line, from the files in the stopwords/ folder of the working directory.

    Args:
        df: The dataframe containing stopwords to remove, it must have a token column when working with tokens
        and a bigrams column otherwise.
        mode: 'token' to filter the token column; any other value filters the bigrams column.

    Returns:
        df: A new dataframe with stopwords removed and the index reset.
    '''

    df = df.copy()

    stopwords_folder = Path("stopwords/")
    if mode == 'token':
        column = 'token'
        file_names = ("ranks_nl_stopwords.txt", "tokens_stopwords.txt", "wikidata_tok_stopwords.txt")
    else:
        column = 'bigrams'
        file_names = ("bigrams_stopwords.txt", "wikidata_big_stopwords.txt")

    # A set gives constant-time membership tests, unlike scanning a list for every row
    stopwords = set()
    for file_name in file_names:
        with open(stopwords_folder / file_name, 'r') as file:
            stopwords.update(line.rstrip('\n') for line in file)

    df[column] = df[column].astype(str)
    # Filter with a boolean mask: dropping by index label would also remove
    # non-stopword rows that share a label with a stopword row.
    keep = ~df[column].isin(stopwords)
    return df.loc[keep].reset_index(drop=True)

def is_stopword(string, stopword_list):
    '''
    Returns True if the given string is a stopword, False otherwise.

    Args:
        string: The string to check against stopword lists.
        stopword_list: The collection of stopwords to look the string up in.
    '''

    return string in stopword_list

def join_dataframes(start, stop, aspect_type, what):
  '''
  Joins the splitted results of aspect extraction.

  The pieces are read from all_movies/<aspect_type>/<what>_<i>.txt for i from start to stop (both included)
  and the joined result is written to <what>_dataset.txt in the working directory. All files use '|' as separator.

  Args:
      start: the first piece to join
      stop: the last piece to join
      aspect_type:  what kind of aspect to join, it can be 'tokens' or 'bigrams'
      what: what to join, it can be 'all_tokens', 'all_bigrams', 'tokens_term_freq' or 'bigrams_term_freq'
  '''

  path = os.path.join('all_movies', str(aspect_type))
  # the piece number `start` is always read, even when stop < start
  piece_numbers = [start, *range(start + 1, stop + 1)]
  pieces = [
    pd.read_csv(os.path.join(path, str(what) + '_' + str(i) + '.txt'), sep='|')
    for i in piece_numbers
  ]
  # DataFrame.append was removed in pandas 2.0; a single concat also avoids re-copying the growing frame
  joined = pd.concat(pieces, ignore_index=True)
  joined.to_csv(str(what) + '_dataset.txt', index=None, sep='|')

## Aspect Selection

In [None]:
from ast import literal_eval
import numpy as np
from collections import Counter
import pandas as pd
import os
import gc
from time import strftime, gmtime
from pathlib import Path

def term_freq(df, mode):
    '''
    Returns a Dataframe containing the term frequency for each token/bigram aggregated by movie.

    Args:
        df: The DataFrame containing tokens or bigrams, it must be the output of extract_tokens or extract_bigrams.
        mode: 'tokens' to read the tokens column (a list or its string representation); any other non-empty
        value reads the comma-separated bigrams column.

    Returns:
        term_freq_concat: The Dataframe with the term_freq, term and movie_id columns, sorted by movie_id in
        descending order. It is empty (with the same columns) when df has no rows.

    Raises:
        Exception: If mode not specified.
    '''

    if not mode:
        raise Exception("You must set a mode, available modes are 'tokens' and 'bigrams'")
    df = df.copy()
    df['movie_id'] = df['movie_id'].astype(str)

    # Collect the per-movie frames and concatenate them once: concatenating inside
    # the loop re-copies the accumulated frame at every iteration.
    frames = []
    for row in df.itertuples():
        if mode == 'tokens':
            wordlist = literal_eval(str(row.tokens))
        else:
            wordlist = row.bigrams.split(',')
        ranked_terms = sortFreqDict(wordListToFreqDict(wordlist))
        movie_freq = pd.DataFrame(ranked_terms, columns=['term_freq', 'term'])
        movie_freq['movie_id'] = str(row.movie_id)
        frames.append(movie_freq)

    if not frames:
        # nothing to aggregate: return an empty frame with the expected columns
        return pd.DataFrame(columns=['term_freq', 'term', 'movie_id'])

    term_freq_concat = pd.concat(frames)
    term_freq_concat.sort_values(by='movie_id', inplace=True, ascending=False)
    term_freq_concat.reset_index(drop=True, inplace=True)
    return term_freq_concat

def wordListToFreqDict(wordlist):
    '''
    Returns a frequency dictionary for the wordlist.

    Args:
        wordlist: The list of terms.

    Returns:
        dict1: The frequency dictionary, mapping each distinct term to its number of occurrences divided by
        the length of the wordlist. Terms appear in order of first occurrence; the result is empty for an
        empty wordlist.
    '''

    doc_dim = len(wordlist)
    # Counter counts every term in a single pass instead of one list.count() scan per element
    dict1 = {term: count / doc_dim for term, count in Counter(wordlist).items()}
    return dict1

def sortFreqDict(freqdict):
    '''
    Returns the content of the frequency dictionary sorted in descending order.

    Args:
        freqdict: The frequency dictionary to sort.

    Returns:
        A list of (frequency, term) tuples sorted from the highest to the lowest; terms with the same
        frequency are sorted in descending order too.
    '''

    pairs = ((frequency, term) for term, frequency in freqdict.items())
    return sorted(pairs, reverse=True)

def doc_freq(df, mode):
    '''
    Returns a Dataframe containing the document frequency for each token/bigram.

    The document frequency of a term is the number of distinct movie ids whose row contains it.

    Args:
        df: The DataFrame containing tokens or bigrams, it must be the output of extract_tokens or extract_bigrams.
        mode: 'tokens' to read the tokens column (a list or its string representation); any other non-empty
        value reads the comma-separated bigrams column.

    Returns:
        doc_freq_df: The Dataframe with the term and doc_freq columns, sorted by doc_freq in descending order.
        It is empty (with the same columns) when df has no rows.

    Raises:
        Exception: If mode not specified.
    '''

    if not mode:
        raise Exception("You must set a mode, available modes are 'tokens' and 'bigrams'")
    df = df.copy()
    df['movie_id'] = df['movie_id'].astype(str)

    # term -> set of the movie ids in which the term appears
    movies_per_term = {}
    for row in df.itertuples():
        if mode == 'tokens':
            items = literal_eval(str(row.tokens))
        else:
            items = row.bigrams.split(',')
        for term in items:
            movies_per_term.setdefault(term, set()).add(row.movie_id)

    counts = [(term, len(movies)) for term, movies in movies_per_term.items()]
    doc_freq_df = pd.DataFrame(counts, columns=['term', 'doc_freq'])
    doc_freq_df.sort_values(by='doc_freq', inplace=True, ascending=False)
    doc_freq_df.reset_index(drop=True, inplace=True)
    return doc_freq_df

def tf_idf(term_freq, doc_freq, doc_number):
    '''
    Returns a Dataframe containing the tf-idf for each token/bigram.

    The score is term_freq * log10(doc_number / doc_freq), computed on the rows obtained by merging the
    two dataframes on the term column.

    Args:
        term_freq: The DataFrame containing term frequency, it must be the output of term_freq.
        doc_freq: The DataFrame containing document frequency, it must be the output of doc_freq.
        doc_number: The number of documents for idf calculation, it is equal to the number of movies analyzed

    Returns:
        tfidf_df: The Dataframe with the tf-idf score for each token/bigram, sorted by tf_idf in descending
        order and without the term_freq and doc_freq columns.
    '''

    # assign() works on copies, so the caller's dataframes are left untouched
    frequencies = term_freq.assign(
        term_freq=term_freq['term_freq'].astype(float),
        term=term_freq['term'].astype(str),
    )
    documents = doc_freq.assign(
        doc_freq=doc_freq['doc_freq'].astype(int),
        term=doc_freq['term'].astype(str),
    )

    tfidf_df = frequencies.merge(documents, on="term")
    inverse_doc_freq = np.log10(doc_number/tfidf_df['doc_freq'])
    tfidf_df['tf_idf'] = tfidf_df['term_freq'] * inverse_doc_freq
    tfidf_df = tfidf_df.drop(columns=['term_freq', 'doc_freq'])
    return tfidf_df.sort_values(by='tf_idf', ascending=False)

def df_tfidf(movie_id_list, tokens_df, bigrams_df, tokens_term_freq, bigrams_term_freq, movies_number):
  '''
  Calculates document frequency and tf-idf for tokens first and for bigrams afterwards, and saves them to files.

  For each kind of aspect the global results are written to all_movies/<kind>/<kind>_doc_freq.txt and
  all_movies/<kind>/<kind>_tfidf.txt, then per_movie_freq_list writes the per-movie files.

  Args:
      movie_id_list: list of movie ids to consider during tf-idf calculation
      tokens_df: dataframe with all the extracted tokens, it can be obtained by joining together all partial dataframes
      bigrams_df: dataframe with all the extracted bigrams, it can be obtained by joining together all partial dataframes
      tokens_term_freq: dataframe with the tokens term frequency, it can be obtained by joining together all partial dataframes
      bigrams_term_freq: dataframe with the bigrams term frequency, it can be obtained by joining together all partial dataframes
      movies_number: number of movies in movie_id_list

  '''

  # tokens are fully processed before bigrams
  jobs = (
    ('tokens', tokens_df, tokens_term_freq),
    ('bigrams', bigrams_df, bigrams_term_freq),
  )

  for kind, aspects_df, aspects_term_freq in jobs:
    # Define the output path for this kind of aspect
    out_dir = os.path.join('all_movies', kind)
    if not os.path.exists(out_dir):
      os.makedirs(out_dir)

    print(strftime("%H:%M:%S", gmtime()) + ' ' + kind + ' document frequency... \n')

    # Document frequency
    kind_doc_freq = doc_freq(aspects_df, kind)
    kind_doc_freq.to_csv(os.path.join(out_dir, kind + '_doc_freq.txt'), sep="|", header=['term', 'doc_freq'], index=None)

    print(strftime("%H:%M:%S", gmtime()) + ' ' + kind + ' tf-idf... \n')

    # tf-idf for each movie
    kind_tfidf = tf_idf(aspects_term_freq, kind_doc_freq, movies_number)
    kind_tfidf.to_csv(os.path.join(out_dir, kind + '_tfidf.txt'), sep="|", header=['term', 'movie_id', 'tf_idf'], index=None)

    per_movie_freq_list(movie_id_list, kind_doc_freq, kind, 'doc_freq')
    per_movie_freq_list(movie_id_list, kind_tfidf, kind, 'tf_idf')

  return

def per_movie_freq_list(movie_id_list, df, mode, df_type):
  '''
  Create, for every movie, a file with term frequency, document frequency or tf-idf for tokens or bigrams.

  The file is written to <movie_id>/<mode>/<mode>_<df_type>.txt with '|' as separator and its rows are sorted
  by df_type in descending order. For 'doc_freq' the rows of df are restricted to the terms listed in
  <movie_id>/<mode>/extracted_<mode>.txt; a movie whose file can't be read is skipped. For the other types
  the rows of df are selected through its movie_id column.

  Args:
      movie_id_list: list of movie ids to consider creating the list
      df: the dataframe with term frequency, document frequency or tf-idf
      mode: what kind of aspects are in df, it can be 'tokens' or 'bigrams'
      df_type: what kind of dataframe is df, it can be 'term_freq', 'doc_freq' or 'tf_idf'

  Raises:
      Exception: If df_type or mode are not specified.
  '''

  if not df_type:
    raise Exception("You must set a df_type, available df_types are 'term_freq', 'doc_freq' and 'tf_idf")

  if not mode:
    raise Exception("You must set a mode, available modes are 'tokens' and 'bigrams'")

  timestamp = strftime("%H:%M:%S", gmtime())
  print(timestamp + ' per movie ' + str(mode) + str(df_type) + '\n')

  df = df.copy()

  for movie_id in movie_id_list:
    path = os.path.join(movie_id, mode)
    if not os.path.exists(path):
      os.makedirs(path)
    if df_type == 'doc_freq':
      # The file holds one term per line. It is read directly because
      # pandas.read_csv rejects '\n' as a separator.
      try:
        with open(os.path.join(path, 'extracted_' + str(mode) + '.txt'), encoding='UTF8') as terms_file:
          terms = [line.rstrip('\n') for line in terms_file]
      except OSError:
        # skip only this movie instead of giving up on all the remaining ones
        continue
      extracted_aspect_terms = pd.DataFrame({'term': [term for term in terms if term]})
      movie_df = df.merge(extracted_aspect_terms, on='term')
      # a term appears once for every occurrence in the extracted file
      movie_df = movie_df.drop_duplicates()
    else:
      movie_df = df.loc[df['movie_id'] == movie_id]

    # work on new frames: sorting or deleting in place on a .loc selection triggers chained-assignment warnings
    movie_df = movie_df.sort_values(by=df_type, ascending=False, ignore_index=True)
    movie_df = movie_df.drop(columns='movie_id', errors='ignore')
    if df_type != 'term_freq':
      movie_df.to_csv(os.path.join(path, str(mode) + '_' + str(df_type) + '.txt'), sep='|', header=['term', str(df_type)], index=None)
    else:
      movie_df.to_csv(os.path.join(path, str(mode) + '_' + str(df_type) + '.txt'), sep='|', header=[str(df_type), 'term'], index=None)
    del movie_df
    gc.collect()

  return

def top_k_and_overlapping(movie_id_list, k):
  '''
  Extract the top k for tokens and bigrams coming from movies in movie_id_list and save the overlapping lists.

  The overlapping lists are written to tokens_overlapping_list_<k>.txt and bigrams_overlapping_list_<k>.txt
  in the working directory, with '|' as separator.
  '''

  # build the per-movie top k, first for tokens and then for bigrams
  extract_movie_tok_top_k(movie_id_list, k)
  extract_movie_big_top_k(movie_id_list, k)

  good_bigrams_to_tok(movie_id_list, k)

  # compute the overlapping aspects for tokens and bigrams before writing anything
  tokens_overlap = tokens_overlapping_list(movie_id_list, k)
  bigrams_overlap = bigrams_overlapping_list(movie_id_list, k)

  tokens_overlap.to_csv(f'tokens_overlapping_list_{k}.txt', sep='|', index=None)
  bigrams_overlap.to_csv(f'bigrams_overlapping_list_{k}.txt', sep='|', index=None)

  return

def extract_movie_tok_top_k(id_list, k):
  '''
  Extract the top k list for tokens by tf-idf for each movie in id_list.

  The first k rows of <movie_id>/tokens/tokens_tf_idf.txt are copied to <movie_id>/tokens/top<k>_tokens.txt.
  '''

  for movie_id in id_list:
    folder = os.path.join(movie_id, 'tokens')
    ranked = pd.read_csv(os.path.join(folder, 'tokens_tf_idf.txt'), sep='|')
    ranked.head(k).to_csv(os.path.join(folder, f'top{k}_tokens.txt'), sep='|', index=None)

def extract_movie_big_top_k(id_list, k):
  '''
  Extract the top k list for bigrams by tf-idf for each movie in id_list.

  The first k rows of <movie_id>/bigrams/bigrams_tf_idf.txt are copied to <movie_id>/bigrams/top<k>_bigrams.txt.
  '''

  for movie_id in id_list:
    folder = os.path.join(movie_id, 'bigrams')
    ranked = pd.read_csv(os.path.join(folder, 'bigrams_tf_idf.txt'), sep='|')
    ranked.head(k).to_csv(os.path.join(folder, f'top{k}_bigrams.txt'), sep='|', index=None)

def aspect_term_tok_list(id_list, k):
  '''
  Concatenates together the top k tokens list for every movie in id_list.

  The lists are read from <movie_id>/tokens/top<k>_tokens.txt. Duplicated rows are removed and every
  remaining term receives a progressive term_id starting from 0; term_id takes the position of the term
  column and vice versa.
  '''

  frames = [pd.DataFrame()]
  for movie_id in id_list:
    top_terms = pd.read_csv(os.path.join(movie_id, 'tokens', 'top' + str(k) + '_tokens.txt'), sep='|')
    frames.append(top_terms.drop(columns='tf_idf'))
  concat_df = pd.concat(frames)

  concat_df['term'] = concat_df['term'].astype(str)
  concat_df = concat_df.drop_duplicates().reset_index(drop=True)
  concat_df['term_id'] = concat_df.index

  # swap the positions of the term and term_id columns
  columns = list(concat_df.columns)
  term_pos, id_pos = columns.index('term'), columns.index('term_id')
  columns[term_pos], columns[id_pos] = 'term_id', 'term'

  return concat_df[columns]

def aspect_term_big_list(id_list, k):
  '''Build the deduplicated list of the top k bigrams of every movie in id_list.

  For each movie the file <movie_id>/bigrams/top<k>_bigrams.txt is read and its
  tf_idf column is discarded. The rows of all movies are stacked, duplicated
  rows are removed and every remaining row receives a progressive term_id
  starting from 100000.

  Args:
      id_list: Iterable of movie ids, each one being also the movie directory.
      k: Size of the top-k files to read.

  Returns:
      A DataFrame where the term and term_id columns have exchanged position
      (term_id, term when the top-k files only hold term and tf_idf).
      An empty DataFrame with those two columns when id_list is empty.
  '''

  frames = []
  for movie_id in id_list:
    top_path = os.path.join(movie_id, 'bigrams', 'top' + str(k) + '_bigrams.txt')
    frames.append(pd.read_csv(top_path, sep='|').drop(columns='tf_idf'))

  if not frames:
    return pd.DataFrame(columns=['term_id', 'term'])

  # one concat over all frames: concatenating inside the loop copies the
  # accumulated rows again at every iteration
  terms = pd.concat(frames)
  terms['term'] = terms['term'].astype(str)
  terms = terms.drop_duplicates().reset_index(drop=True)

  # bigram ids start at 100000, presumably to keep them apart from the token
  # ids when the two lists are unified (TODO confirm)
  terms.index += 100000
  terms['term_id'] = terms.index

  # exchange the positions of term and term_id
  cols = list(terms.columns)
  term_pos, id_pos = cols.index('term'), cols.index('term_id')
  cols[term_pos], cols[id_pos] = cols[id_pos], cols[term_pos]

  return terms[cols]

def map_aspect_terms(movie_id_list, k, map_tokens=True, map_bigrams=True):
  '''Write the aspect list and the movie/aspect mapping files for tokens and/or bigrams.

  For each enabled kind, the top-k terms of all movies are merged with the
  matching *_overlapping_list_clean.txt file (inner merge on term), the
  occurrences column is removed and the result is saved without header as
  tokens_list.txt / bigrams_list.txt. The mapping is then computed and saved
  without header as tokens_mapping.txt / bigrams_mapping.txt.

  Args:
      movie_id_list: Iterable of movie ids, each one being also the movie directory.
      k: Size of the top-k files to read.
      map_tokens: When True the token files are produced.
      map_bigrams: When True the bigram files are produced.
  '''

  if map_tokens:
    print(strftime("%H:%M:%S", gmtime()) + ' mapping tokens... \n')
    candidates = aspect_term_tok_list(movie_id_list, k)
    popular = pd.read_csv('tokens_overlapping_list_clean.txt', sep='|')
    selected = candidates.merge(popular, on='term').drop(columns='occurrences')
    # the list has to be on disk before the mapping step, which reads it back
    selected.to_csv('tokens_list.txt', sep='|', index=None, header=None)
    mapping = tokens_mapping(movie_id_list, k)
    mapping.to_csv('tokens_mapping.txt', sep='|', index=None, header=None)

  if map_bigrams:
    print(strftime("%H:%M:%S", gmtime()) + ' mapping bigrams... \n')
    candidates = aspect_term_big_list(movie_id_list, k)
    popular = pd.read_csv('bigrams_overlapping_list_clean.txt', sep='|')
    selected = candidates.merge(popular, on='term').drop(columns='occurrences')
    # the list has to be on disk before the mapping step, which reads it back
    selected.to_csv('bigrams_list.txt', sep='|', index=None, header=None)
    mapping = bigrams_mapping(movie_id_list, k)
    mapping.to_csv('bigrams_mapping.txt', sep='|', index=None, header=None)

def tokens_mapping(id_list, k):
  '''Map every movie in id_list to the ids of the selected tokens found in its top k.

  The selected tokens are read from tokens_list.txt (no header, columns
  term_id and term) in the working directory. For each movie the file
  <movie_id>/tokens/top<k>_tokens.txt is inner-merged with that list on term,
  so only the selected tokens survive.

  Args:
      id_list: Iterable of movie ids, each one being also the movie directory.
      k: Size of the top-k files to read.

  Returns:
      A DataFrame with one row per (movie, token) pair and the columns
      movie_id, property_type (always 'review') and term_id, in this order
      when the top-k files only hold term and tf_idf.
      An empty DataFrame with those columns when id_list is empty.
  '''

  aspect_term_df = pd.read_csv('tokens_list.txt', sep='|', names=['term_id', 'term'])
  aspect_term_df['term_id'] = aspect_term_df['term_id'].astype(int)

  frames = []
  for movie_id in id_list:
    top_path = os.path.join(movie_id, 'tokens', 'top' + str(k) + '_tokens.txt')
    movie_terms = pd.read_csv(top_path, sep='|').drop(columns='tf_idf') # all the tokens extracted from the movie
    movie_terms = movie_terms.merge(aspect_term_df, on='term')
    movie_terms['movie_id'] = movie_id
    frames.append(movie_terms.drop(columns='term'))

  if not frames:
    return pd.DataFrame(columns=['movie_id', 'property_type', 'term_id'])

  # one concat over all frames: concatenating inside the loop copies the
  # accumulated rows again at every iteration
  mapping = pd.concat(frames)
  mapping['property_type'] = 'review'

  # rotate the three columns: the slot of term_id receives movie_id, the slot
  # of movie_id receives property_type and the slot of property_type receives term_id
  cols = list(mapping.columns)
  a, b, c = cols.index('term_id'), cols.index('movie_id'), cols.index('property_type')
  cols[a], cols[b], cols[c] = 'movie_id', 'property_type', 'term_id'

  return mapping[cols]

def bigrams_mapping(id_list, k):
  '''Map every movie in id_list to the ids of the selected bigrams found in its top k.

  The selected bigrams are read from bigrams_list.txt (no header, columns
  term_id and term) in the working directory. For each movie the file
  <movie_id>/bigrams/top<k>_bigrams.txt is inner-merged with that list on
  term, so only the selected bigrams survive.

  Args:
      id_list: Iterable of movie ids, each one being also the movie directory.
      k: Size of the top-k files to read.

  Returns:
      A DataFrame with one row per (movie, bigram) pair and the columns
      movie_id, property_type (always 'review') and term_id, in this order
      when the top-k files only hold term and tf_idf.
      An empty DataFrame with those columns when id_list is empty.
  '''

  aspect_term_df = pd.read_csv('bigrams_list.txt', sep='|', names=['term_id', 'term'])
  aspect_term_df['term_id'] = aspect_term_df['term_id'].astype(int)

  frames = []
  for movie_id in id_list:
    top_path = os.path.join(movie_id, 'bigrams', 'top' + str(k) + '_bigrams.txt')
    movie_terms = pd.read_csv(top_path, sep='|').drop(columns='tf_idf') # all the bigrams extracted from the movie
    movie_terms = movie_terms.merge(aspect_term_df, on='term')
    movie_terms['movie_id'] = movie_id
    frames.append(movie_terms.drop(columns='term'))

  if not frames:
    return pd.DataFrame(columns=['movie_id', 'property_type', 'term_id'])

  # one concat over all frames: concatenating inside the loop copies the
  # accumulated rows again at every iteration
  mapping = pd.concat(frames)
  mapping['property_type'] = 'review'

  # rotate the three columns: the slot of term_id receives movie_id, the slot
  # of movie_id receives property_type and the slot of property_type receives term_id
  cols = list(mapping.columns)
  a, b, c = cols.index('term_id'), cols.index('movie_id'), cols.index('property_type')
  cols[a], cols[b], cols[c] = 'movie_id', 'property_type', 'term_id'

  return mapping[cols]

def tokens_overlapping_list(id_list, k):
  '''Count in how many top-k rows each token appears across the movies in id_list.

  For each movie the file <movie_id>/tokens/top<k>_tokens.txt is read; the
  rows of all movies are stacked and counted per term. The count equals the
  number of movies only if a term appears at most once in each top-k file.

  Args:
      id_list: Iterable of movie ids, each one being also the movie directory.
      k: Size of the top-k files to read.

  Returns:
      A DataFrame with the columns term and occurrences, sorted by
      occurrences in descending order. An empty DataFrame with those columns
      when id_list is empty.
  '''

  frames = []
  for movie_id in id_list:
    top_path = os.path.join(movie_id, 'tokens', 'top' + str(k) + '_tokens.txt')
    frames.append(pd.read_csv(top_path, sep='|').drop(columns='tf_idf'))

  if not frames:
    return pd.DataFrame(columns=['term', 'occurrences'])

  # one concat over all frames: concatenating inside the loop copies the
  # accumulated rows again at every iteration
  terms = pd.concat(frames)
  terms['term'] = terms['term'].astype(str)

  counts = terms.groupby(['term']).size().reset_index(name='occurrences')
  return counts.sort_values(by='occurrences', ascending=False)

def bigrams_overlapping_list(id_list, k):
  '''Count in how many top-k rows each bigram appears across the movies in id_list.

  For each movie the file <movie_id>/bigrams/top<k>_bigrams.txt is read; the
  rows of all movies are stacked and counted per term. The count equals the
  number of movies only if a term appears at most once in each top-k file.

  Args:
      id_list: Iterable of movie ids, each one being also the movie directory.
      k: Size of the top-k files to read.

  Returns:
      A DataFrame with the columns term and occurrences, sorted by
      occurrences in descending order. An empty DataFrame with those columns
      when id_list is empty.
  '''

  frames = []
  for movie_id in id_list:
    top_path = os.path.join(movie_id, 'bigrams', 'top' + str(k) + '_bigrams.txt')
    frames.append(pd.read_csv(top_path, sep='|').drop(columns='tf_idf'))

  if not frames:
    return pd.DataFrame(columns=['term', 'occurrences'])

  # one concat over all frames: concatenating inside the loop copies the
  # accumulated rows again at every iteration
  terms = pd.concat(frames)
  terms['term'] = terms['term'].astype(str)

  counts = terms.groupby(['term']).size().reset_index(name='occurrences')
  return counts.sort_values(by='occurrences', ascending=False)

def count_aspect_terms_frequency(movie_id_list, k):
  '''Compute the token and bigram overlapping lists and save them to tok_overlap.txt and big_overlap.txt.

  Args:
      movie_id_list: Movie ids forwarded to tokens_overlapping_list and bigrams_overlapping_list.
      k: Size of the top-k files, forwarded as well.
  '''

  # both lists are computed before anything is written
  token_counts = tokens_overlapping_list(movie_id_list, k)
  bigram_counts = bigrams_overlapping_list(movie_id_list, k)

  for counts, file_name in ((token_counts, "tok_overlap.txt"), (bigram_counts, "big_overlap.txt")):
    counts.to_csv(file_name, sep='|', index=None)

def remove_good(token):
  '''Return the text that follows the first space of token, or token itself when it has no space.

  Applied to a bigram it drops the first word, whatever that word is.
  '''
  _first, space, remainder = token.partition(' ')
  return remainder if space else token

def good_bigrams_to_tok(movie_id_list, k):
  '''Move the "positive" bigrams of every movie from its top-k bigrams file to its top-k tokens file.

  A bigram is "positive" when one of its words is in the adjective list below.
  For each movie with at least one such bigram:
    * the first word of every positive bigram is stripped with remove_good
      (NOTE(review): this is the first word even when the adjective is the
      second one - confirm that bigrams always start with the adjective);
    * the remaining words go through remove_stopwords and are deduplicated;
    * the positive bigrams are removed from top<k>_bigrams.txt;
    * the rows of top<k>_tokens.txt containing one of the new words are
      removed and the new rows are appended at the end of the file. The file
      is not re-sorted and is not cut back to k rows.

  Args:
      movie_id_list: Iterable of movie ids, each one being also the movie directory.
      k: Size of the top-k files to edit.
  '''

  # adjectives that mark a bigram as positive
  search = 'beautiful decent enjoyable entertaining excellent good great nice worthy superb wonderful amazing'
  positive_adjectives = set(search.split())

  for movie_id in movie_id_list:
    bigrams_path = os.path.join(movie_id, 'bigrams', 'top' + str(k) + '_bigrams.txt')
    tokens_path = os.path.join(movie_id, 'tokens', 'top' + str(k) + '_tokens.txt')

    bigrams_df = pd.read_csv(bigrams_path, sep='|')

    # computed once and reused both to select and to remove the positive bigrams
    is_positive = pd.Series(
      [any(word in positive_adjectives for word in words) for words in bigrams_df['term'].str.split().values],
      index=bigrams_df.index,
      dtype=bool,
    )

    # explicit copy: the frame is edited below and must not be a view of bigrams_df
    res = bigrams_df[is_positive].copy()
    if res.empty:
      continue

    res['term'] = res['term'].map(remove_good)
    # remove_stopwords is invoked on a column named 'token', so the column is
    # renamed around the call
    res = res.rename(columns={'term': 'token'})
    res = remove_stopwords(res, 'token')
    res = res.rename(columns={'token': 'term'}).drop_duplicates(subset='term')

    bigrams_df[~is_positive].to_csv(bigrams_path, sep='|', index=None)

    tokens_df = pd.read_csv(tokens_path, sep='|')
    tokens_df['term'] = tokens_df['term'].astype(str)
    promoted_terms = set(res['term'])
    already_listed = pd.Series(
      [any(word in promoted_terms for word in words) for words in tokens_df['term'].str.split().values],
      index=tokens_df.index,
      dtype=bool,
    )

    pd.concat([tokens_df[~already_listed], res]).to_csv(tokens_path, sep='|', index=None)
  
def edit_aspects_label(df, mode):
    '''
    Returns a copy of the given df where the terms that are also listed as movie names get the ' (property)' suffix.

    The names are loaded, one per line, from the files in the stopwords folder:
    tokens_movie_names_stopwords.txt for tokens, bigrams_movie_names_stopwords.txt
    and wikidata_big_stopwords.txt for bigrams.

    Args:
        df: The dataframe of aspects to relabel, it must have a term column.
        mode: String that specifies if working with tokens or bigrams: 'token' selects the token file, any other value selects the bigram files.
    '''

    if mode == 'token':
        name_files = ["tokens_movie_names_stopwords.txt"]
    else:
        name_files = ["bigrams_movie_names_stopwords.txt", "wikidata_big_stopwords.txt"]

    stopwords_folder = Path("stopwords/")
    movie_names_list = []
    for name_file in name_files:
        with open(stopwords_folder / name_file, 'r') as file:
            movie_names_list.extend(line.rstrip('\n') for line in file)

    relabelled = df.copy()
    terms = relabelled['term'].astype(str)
    relabelled['term'] = terms.map(lambda term: edit_label(term, movie_names_list))

    return relabelled

def edit_label(string, movie_names_list):
    '''
    Returns the given string followed by ' (property)' if it is in movie_names_list, the string unchanged otherwise.

    Args:
        string: The aspect label to check.
        movie_names_list: list of movie names that trigger the editing of the label
    '''
    return string + ' (property)' if string in movie_names_list else string

# Esecuzione
Eseguendo le celle di seguito è possibile estrarre gli aspetti a partire dai vari pezzi di dataset. 
È possibile definire quali pezzi di dataset si vogliono utilizzare modificando i parametri start e stop.

In [None]:
start = 0 # first dataset piece to process, default = 0
stop = 39 # last dataset piece to process (inclusive), default = 39

## Pre-processing
Esegue le funzioni di Pre-processing sui vari pezzi di dataset salvando i risultati in altrettanti pezzi. 
*   I file in splitted_sentiment sono i token risultanti da ogni frase associati al sentiment rilevato per la frase di provenienza;
*   I file in splitted_processed_tokens sono i token provenienti dalle frasi con sentiment positivo associati ai relativi PoS tag e sentiment score calcolato da SentiWordNet.

Input di questa fase: pezzi di dataset contenente le recensioni (file in splitted_dataset);

Output di questa fase: pezzi di dataset elaborati (file in splitted_sentiment e splitted_processed_tokens).

In [None]:
sentiment_path = 'splitted_sentiment'
processed_tokens_path = 'splitted_processed_tokens'

sentiment_columns = ['movie_id', 'review_id', 'token', 'sentiment']

# Pre-process every dataset piece from start to stop (both included)
for counter in range(start, stop + 1):
  time = strftime("%H:%M:%S", gmtime())
  print(time + ' Pre-processing df' + str(counter) + '\n')
  sample_df = pd.read_csv(os.path.join('splitted_dataset', 'df' + str(counter) + '.txt'), sep='|', header=None, names=['movie_id', 'review_id', 'review_text'])

  # tokens of every sentence with the sentiment of the sentence they come from
  sentiment_file = os.path.join(sentiment_path, 'sentiment' + str(counter) + '.txt')
  sentiment_df = sentiment_analysis(sample_df)
  sentiment_df.to_csv(sentiment_file, sep="|", index=None, header=sentiment_columns)
  # The file is read back right after being written. It starts with a header
  # line, so header=0 is needed: with header=None that line would come back
  # as a data row. The ids are kept as strings, as they were when the header
  # line forced every column to text.
  sentiment_df = pd.read_csv(sentiment_file, sep='|', header=0, names=sentiment_columns, dtype={'movie_id': str, 'review_id': str})

  # keep the tokens of positive sentences, then PoS tagging/lemmas, lowercase and SentiWordNet scores
  pass1 = filter_by_sentiment(sentiment_df, 'positive')
  pass2 = pos_lemma(pass1)
  pass3 = lowercase(pass2)
  processed_tokens_df = calculate_swn_score(pass3)

  processed_tokens_df.to_csv(os.path.join(processed_tokens_path, 'processed_tokens' + str(counter) + '.txt'), sep='|', index=None, header=['movie_id', 'review_id', 'token', 'upos', 'xpos', 'feats', 'positivity', 'negativity', 'objectivity'])

  time = strftime("%H:%M:%S", gmtime())
  print(time + ' Pre-processing df' + str(counter) + ' DONE \n')

time = strftime("%H:%M:%S", gmtime())
print(time + ' Pre-processing COMPLETE \n')
print('===========================\n')

## Aspect Extraction


Esegue le funzioni di aspect extraction partendo dai file in splitted_processed_tokens. Da questa estrazione si ottengono unigrammi, bigrammi e relativa term frequency.

Input di questa fase: pezzi di dataset in splitted_processed_tokens;

Output di questa fase: liste di unigrammi e bigrammi estratti per ogni film e relativa term frequency.

In [None]:
from time import strftime, gmtime
from pathlib import Path

# Switches selecting which extractions are run on every dataset piece
tokens_extraction = True
bigrams_extraction = True

# Extract tokens and bigrams from every dataset piece from start to stop (both included)
for counter in range(start, stop + 1):
  time = strftime("%H:%M:%S", gmtime())
  splitted_df_folder = Path("splitted_processed_tokens/")
  processed_tokens_df_str = ('processed_tokens' + str(counter) + '.txt')
  file_path = splitted_df_folder / processed_tokens_df_str
  print(time + ' Extracting from processed_tokens_df: ' + processed_tokens_df_str + '\n')
  processed_tokens_df = pd.read_csv(file_path, sep='|')
  movies_number = processed_tokens_df['movie_id'].nunique() # number of distinct movies in this piece, only logged here
  time = strftime("%H:%M:%S", gmtime())
  print(time + ' movies_number: ' + str(movies_number) + '\n')
  movie_id_list = processed_tokens_df['movie_id'].unique() # ids of all the movies in this piece of dataset

  # Tokens extraction
  if tokens_extraction:
    time = strftime("%H:%M:%S", gmtime())
    print(time + ' extracting tokens... \n')
    # folder collecting the per-piece token files
    tokens_path = os.path.join('all_movies', 'tokens')
    os.makedirs(tokens_path, exist_ok=True)
    # filter and extract tokens
    tokens_df = extract_tokens(processed_tokens_df)
    path = os.path.join(tokens_path, 'all_tokens_' + str(counter) + '.txt') # all the tokens extracted from this piece
    tokens_df.to_csv(path, sep="|", header=['movie_id', 'tokens'], index=None)
    save_tokens(tokens_df) # save tokens to files for each movie
    # tokens term frequency for each movie
    tokens_term_freq = term_freq(tokens_df, 'tokens')
    path = os.path.join(tokens_path, 'tokens_term_freq_' + str(counter) + '.txt')
    tokens_term_freq.to_csv(path, sep="|", header=['term_freq', 'term', 'movie_id'], index=None)
    time = strftime("%H:%M:%S", gmtime())
    print(time + ' creating tokens lists... \n')
    per_movie_freq_list(movie_id_list, tokens_term_freq, 'tokens', 'term_freq') # save tokens term frequency for each movie in movie_id_list

  # Bigrams extraction
  if bigrams_extraction:
    time = strftime("%H:%M:%S", gmtime())
    print(time + ' extracting bigrams... \n')
    # folder collecting the per-piece bigram files
    bigrams_path = os.path.join('all_movies', 'bigrams')
    os.makedirs(bigrams_path, exist_ok=True)
    # filter and extract bigrams
    bigrams_df = extract_bigrams(processed_tokens_df)
    path = os.path.join(bigrams_path, 'all_bigrams_' + str(counter) + '.txt') # all the bigrams extracted from this piece
    bigrams_df.to_csv(path, sep="|", header=['movie_id', 'bigrams'], index=None)
    save_bigrams(bigrams_df) # save bigrams to files for each movie
    # bigrams term frequency for each movie
    bigrams_term_freq = term_freq(bigrams_df, 'bigrams')
    path = os.path.join(bigrams_path, 'bigrams_term_freq_' + str(counter) + '.txt')
    bigrams_term_freq.to_csv(path, sep="|", header=['term_freq', 'term', 'movie_id'], index=None)
    time = strftime("%H:%M:%S", gmtime())
    print(time + ' creating bigrams lists... \n')
    per_movie_freq_list(movie_id_list, bigrams_term_freq, 'bigrams', 'term_freq') # save bigrams term frequency for each movie in movie_id_list

  # reported for every piece, whichever extraction is enabled
  time = strftime("%H:%M:%S", gmtime())
  print(time + ' Extraction from processed_tokens' + str(counter) + ' DONE \n')

time = strftime("%H:%M:%S", gmtime())
print(time + ' Aspect Extraction COMPLETE \n')
print('===========================\n')

Viene eseguita l'unione di tutti i file contenenti unigrammi e bigrammi estratti e relativa term frequency

In [None]:
# Join the per-piece files produced by the aspect extraction, one call per
# (folder, file prefix) pair
for kind, file_prefix in (
  ('tokens', 'all_tokens'),
  ('tokens', 'tokens_term_freq'),
  ('bigrams', 'all_bigrams'),
  ('bigrams', 'bigrams_term_freq'),
):
  join_dataframes(start, stop, kind, file_prefix)

## Aspect Selection

Input di questa fase: liste di unigrammi e bigrammi estratti per ogni film e relativa term frequency;

Output di questa fase: file per il popolamento del db di ConveRSE.

### Generazione lista di film da escludere
Viene generata la lista di film da escludere.
Per decidere quali film escludere settare i parametri tokens_delete_threshold e bigrams_delete_threshold.
Il valore assegnato a ciascun parametro è una soglia: vengono esclusi i film con un numero di unigrammi/bigrammi estratti inferiore a tale soglia.

In [None]:
def _movies_below_threshold(kind, threshold):
  '''Return the movies with fewer than threshold extracted terms of the given kind.

  The movies are those of dataset_movie_list.txt that also appear in
  reviewsPerFilm.txt. For each of them the rows of
  <movie_id>/<kind>/extracted_<kind>.txt are counted; a file that cannot be
  opened counts as 0 rows.

  Args:
      kind: 'tokens' or 'bigrams'.
      threshold: Minimum number of extracted terms a movie needs to be kept.

  Returns:
      A DataFrame with the columns movie_id, n_reviews and n_<kind> holding
      the movies below the threshold, sorted by n_reviews in descending order.
  '''
  movies = pd.read_csv('dataset_movie_list.txt', sep='|', names=['movie_id']) # ids of the movies that have reviews
  reviews_per_movie = pd.read_csv('reviewsPerFilm.txt', sep='|', names=['movie_id', 'n_reviews']) # movie ids with their number of reviews
  movies = movies.merge(reviews_per_movie, on='movie_id')

  count_column = 'n_' + kind
  rows = []
  for row in movies.itertuples():
    extracted_path = os.path.join(str(row.movie_id), kind, 'extracted_' + kind + '.txt')
    try:
      n_extracted = len(pd.read_csv(extracted_path, sep='|', names=[kind]).index)
    except OSError:
      # no readable file for this movie: nothing was extracted
      n_extracted = 0
    rows.append({'movie_id': str(row.movie_id), 'n_reviews': row.n_reviews, count_column: n_extracted})

  counts = pd.DataFrame(rows, columns=['movie_id', 'n_reviews', count_column])
  below = counts[counts[count_column] < threshold]
  return below.sort_values(by='n_reviews', ascending=False)


# Unigrams
tokens_delete_threshold = 150 # exclude the movies with fewer than this many extracted unigrams

df = _movies_below_threshold('tokens', tokens_delete_threshold)
df.to_csv('no_tokens_list.txt', sep='|', index=None, columns=['movie_id'])

# Bigrams
bigrams_delete_threshold = 150 # exclude the movies with fewer than this many extracted bigrams

df = _movies_below_threshold('bigrams', bigrams_delete_threshold)
df.to_csv('no_bigrams_list.txt', sep='|', index=None, columns=['movie_id'])


# Merge the two lists: a movie is excluded when it is below either threshold
tok_del_list = pd.read_csv('no_tokens_list.txt', sep='|')
big_del_list = pd.read_csv('no_bigrams_list.txt', sep='|')

delete_list_df = tok_del_list.merge(big_del_list, on="movie_id", how='outer')
delete_list_df.to_csv('delete_list.txt', sep='|', index = None, header = None)

### Document Frequency e TF-IDF
A questo punto, dato che sono stati definiti i film da escludere dall'estrazione, è possibile calcolare la Document Frequency e la TF-IDF.

In [None]:
dataset_movie_list_df = pd.read_csv('dataset_movie_list.txt', sep='|', names=['movie_id'])
delete_list_df = pd.read_csv('delete_list.txt', sep='|', names=['movie_id'])
movie_id_list = dataset_movie_list_df['movie_id'].unique()
delete_list = delete_list_df['movie_id'].unique()
movies_number = len(movie_id_list)
print('movies_number: '+ str(movies_number) + '\n')
print('delete_list_number: ' + str(len(delete_list)) + '\n')

tokens_df = pd.read_csv('all_tokens_dataset.txt', sep='|')
bigrams_df = pd.read_csv('all_bigrams_dataset.txt', sep='|')
tokens_term_freq = pd.read_csv('tokens_term_freq_dataset.txt', sep='|')
bigrams_term_freq = pd.read_csv('bigrams_term_freq_dataset.txt', sep='|')

# Drop the movies in the delete list from every frame. Ids are compared as
# strings on both sides, so the filter also works when read_csv parses the ids
# as numbers; a single isin per frame replaces one scan per deleted movie.
delete_ids = {str(movie_id) for movie_id in delete_list}
dataset_movie_list_df = dataset_movie_list_df[~dataset_movie_list_df['movie_id'].astype(str).isin(delete_ids)]
tokens_df = tokens_df[~tokens_df['movie_id'].astype(str).isin(delete_ids)]
bigrams_df = bigrams_df[~bigrams_df['movie_id'].astype(str).isin(delete_ids)]
tokens_term_freq = tokens_term_freq[~tokens_term_freq['movie_id'].astype(str).isin(delete_ids)]
bigrams_term_freq = bigrams_term_freq[~bigrams_term_freq['movie_id'].astype(str).isin(delete_ids)]

movie_id_list = dataset_movie_list_df['movie_id'].unique()
movies_number = len(movie_id_list)
print('movies_number after removing movies from delete_list: '+ str(movies_number) + '\n')

df_tfidf(movie_id_list, tokens_df, bigrams_df, tokens_term_freq, bigrams_term_freq, movies_number) # TODO: fix the bigrams handling inside df_tfidf

### Estrazione top k aspetti
Una volta che abbiamo Document Frequency e TF-IDF possiamo far girare questa funzione che estrarrà per ogni film i top k token e bigrammi e calcolerà l'overlapping. La funzione per la trasformazione dei "bigrammi positivi" viene chiamata qui dentro subito dopo aver estratto i top k bigrammi.
Il numero di aspetti da estrarre può essere selezionato modificando il parametro k.

In [None]:
k = 100 # number of aspects to extract for each movie

# uses the movie_id_list left by the Document Frequency / TF-IDF cell
top_k_and_overlapping(movie_id_list, k)

### Rimozione aspetti poco comuni
Vengono infine rimossi gli aspetti "poco comuni". La soglia di popolarità si imposta tramite il parametro popularity_threshold: gli aspetti che compaiono in meno di popularity_threshold film vengono rimossi.

In [None]:
popularity_threshold = 13

# Unigrams: discard the rows whose occurrences are below the threshold
tok_over_df = pd.read_csv('tokens_overlapping_list_' + str(k) + '.txt', sep='|')
tok_over_df = tok_over_df[~(tok_over_df.occurrences < popularity_threshold)]
tok_over_df.to_csv('tokens_overlapping_list_clean.txt', sep='|', index=None)

# Bigrams: discard the rows whose occurrences are below the threshold
big_over_df = pd.read_csv('bigrams_overlapping_list_' + str(k) + '.txt', sep='|')
big_over_df = big_over_df[~(big_over_df.occurrences < popularity_threshold)]
big_over_df.to_csv('bigrams_overlapping_list_clean.txt', sep='|', index=None)

### Mapping

Questa funzione crea i file utilizzati per popolare la Knowledge Base di ConveRSE.

In [None]:
# writes tokens_list.txt, tokens_mapping.txt, bigrams_list.txt and bigrams_mapping.txt
map_aspect_terms(movie_id_list, k, map_tokens=True, map_bigrams=True)

### Crea lista unificata
Unisce i file per il popolamento della Knowledge Base di ConveRSE 

In [None]:
# Unified aspect list: tokens followed by bigrams
tokens_list = pd.read_csv("tokens_list.txt", sep='|', names=['id', 'term'])
bigrams_list = pd.read_csv("bigrams_list.txt", sep='|', names=['id', 'term'])
unified_list = pd.concat([tokens_list, bigrams_list])
unified_list.to_csv('unified_list.txt', sep='|', index=None, header=None)

# Unified movie/aspect mapping: tokens followed by bigrams.
# The frames are not named tokens_mapping / bigrams_mapping: those names
# belong to the mapping functions, and rebinding them to DataFrames makes a
# later run of map_aspect_terms fail.
tokens_mapping_df = pd.read_csv("tokens_mapping.txt", sep='|', names=['movie_id', 'property_type', 'aspect_id'])
bigrams_mapping_df = pd.read_csv("bigrams_mapping.txt", sep='|', names=['movie_id', 'property_type', 'aspect_id'])
unified_mapping = pd.concat([tokens_mapping_df, bigrams_mapping_df])
unified_mapping.to_csv('unified_mapping.txt', sep='|', index=None, header=None)

## Modifica label degli aspetti che hanno la stessa label di un film

### File per il popolamento della KB di ConveRSE

In [None]:
list_columns = ['id', 'term']

# unigrams: relabel tokens_list.txt in place
tokens_list = pd.read_csv("tokens_list.txt", sep='|', names=list_columns)
edited_tokens_list = edit_aspects_label(tokens_list, 'token')
edited_tokens_list.to_csv('tokens_list.txt', sep='|', header=None, index=None)

# bigrams: relabel bigrams_list.txt in place
bigrams_list = pd.read_csv("bigrams_list.txt", sep='|', names=list_columns)
edited_bigrams_list = edit_aspects_label(bigrams_list, 'bigram')
edited_bigrams_list.to_csv('bigrams_list.txt', sep='|', header=None, index=None)

# unigrams and bigrams: rebuild the unified list from the relabelled files
tokens_list = pd.read_csv("tokens_list.txt", sep='|', names=list_columns)
bigrams_list = pd.read_csv("bigrams_list.txt", sep='|', names=list_columns)
unified_list = pd.concat([tokens_list, bigrams_list])
unified_list.to_csv('unified_list.txt', sep='|', header=None, index=None)

# Conclusione
I file necessari per il popolamento del database di ConveRSE sono: 
*   tokens_list: contiene la lista degli unigrammi estratti corredati dal loro ID;
*   tokens_mapping: contiene la lista degli ID dei film associati agli ID degli unigrammi;
*   bigrams_list: contiene la lista dei bigrammi estratti corredati dal loro ID;
*   bigrams_mapping: contiene la lista degli ID dei film associati agli ID dei bigrammi;
*   unified_list: unione di tokens_list e bigrams_list;
*   unified_mapping: unione di tokens_mapping e bigrams_mapping;

Per popolare il database di ConveRSE lanciare la classe dbPopulator.DatabasePopulator in ConveRSETools passando i seguenti argomenti:
-c percorso del file configuration.json -p percorso della lista di aspetti da utilizzare -m percorso del file contenente gli ID dei film associati agli ID degli aspetti.

Es: -c C:\Users\alema\git\MovieRecSysService\resources\configuration.json -p C:\Users\alema\git\unified_list.txt -m C:\Users\alema\git\unified_mapping.txt
