# ESECUZIONE

In [40]:
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 28 17:45:00 2020

Modulo per calcolare le varie metriche/attributi necessari
per stabilire se un testo è stato scritto da un autore

@author: michele
"""
'''import pyspark
sc = pyspark.SparkContext('local[*]')'''

def prob_of_the_most_common_word(RDD_word_counter, text_len):
    """Return the probability of the most common word other than "the" and "and".

    Only the 30 entries produced by ``prob_distr_of_30_most_common_words``
    are searched.

    Args:
        RDD_word_counter: RDD of ``(word, count)`` pairs. The result is only
            the *most common* word if the RDD is ordered by descending
            count, as ``word_counter`` produces it.
        text_len: total number of words in the text, used as the divisor.

    Returns:
        An RDD holding at most one ``(word, probability)`` pair; it is empty
        when every one of those 30 entries is "the" or "and".
    """
    top_words = prob_distr_of_30_most_common_words(RDD_word_counter, text_len)
    candidates = top_words.filter(lambda pair: pair[0] not in ("the", "and"))
    # take(1) yields a plain list, so wrap it again to keep returning an RDD.
    return sc.parallelize(candidates.take(1))

def prob_of_The(RDD_word_counter, text_len):
    """Return an RDD with the ``("the", probability)`` pair for the word "the".

    Args:
        RDD_word_counter: RDD of ``(word, count)`` pairs.
        text_len: total number of words in the text, used as the divisor.

    Returns:
        An RDD of ``(word, count / text_len)`` restricted to entries whose
        word is exactly "the".
    """
    only_the = RDD_word_counter.filter(lambda pair: pair[0] == "the")
    return only_the.map(lambda pair: (pair[0], pair[1]/text_len))

def prob_distr_of_30_most_common_words(RDD_word_counter, text_len):
    """Return the probabilities of the first 30 entries of the word counter.

    Args:
        RDD_word_counter: RDD of ``(word, count)`` pairs. The first 30
            entries are the most common words only if the RDD is ordered by
            descending count.
        text_len: total number of words in the text, used as the divisor.

    Returns:
        A new RDD of up to 30 ``(word, count / text_len)`` pairs.
    """
    probabilities = RDD_word_counter.map(lambda pair: (pair[0], pair[1]/text_len))
    first_thirty = probabilities.take(30)
    # take() returns a list on the driver; turn it back into an RDD.
    return sc.parallelize(first_thirty)

def hentropy(RDD_word_counter, text_len):
    """Return an RDD holding the base-2 entropy of the word distribution.

    Args:
        RDD_word_counter: RDD of ``(word, count)`` pairs.
        text_len: total number of words in the text, used as the divisor.

    Returns:
        An RDD whose element is ``-sum(p * log2(p))`` with
        ``p = count / text_len`` for every word.
    """
    import math

    def keyed_term(pair):
        # Every term shares one key so reduceByKey adds them all together.
        return ("hentropy", (pair[1]/text_len) * math.log2(pair[1]/text_len))

    terms = RDD_word_counter.map(keyed_term)
    summed = terms.reduceByKey(lambda left, right: left+right)
    # The entropy is the negated sum.
    return summed.map(lambda keyed: -keyed[1])

def text_length_in_words(RDD_word_counter):
    """Return an RDD holding the total number of words in the text.

    Args:
        RDD_word_counter: RDD of ``(word, count)`` pairs,
            e.g. ``[("word1", 100), ...]``.

    Returns:
        An RDD whose element is the sum of all the counts.
    """
    # Put every count under the same key so reduceByKey sums them all.
    keyed_counts = RDD_word_counter.map(lambda pair: ("text_length", pair[1]))
    total = keyed_counts.reduceByKey(lambda left, right: left+right)
    return total.map(lambda keyed: keyed[1])

def word_counter(RDD):
    """Count word occurrences and report the vocabulary size.

    Args:
        RDD: RDD whose elements are iterables of words (one per text line).

    Returns:
        A tuple ``(counts, vocabulary_size)`` where ``counts`` is an RDD of
        ``(word, occurrences)`` pairs sorted by descending occurrences and
        ``vocabulary_size`` is the number of distinct words.
    """
    counts = (RDD.flatMap(lambda words: words)
                 .map(lambda word: (word, 1))
                 .reduceByKey(lambda a, b: a + b)
                 .sortBy(lambda pair: -pair[1])
             )
    # count() returns only the number of pairs, whereas collect() would
    # first bring every pair back to the driver just to measure the list.
    return counts, counts.count()


def getCollection(RDD):
    """Return the result of ``RDD.collect()``."""
    collected = RDD.collect()
    return collected

def getValue(RDD):
    """Return the first element of ``RDD.collect()``.

    Indexing fails if the collected result is empty.
    """
    collected = RDD.collect()
    return collected[0]


def remove_number_some_punctuation_marks(row):
    """Lower-case *row* and strip ASCII digits and double quotes from it.

    Every "--" is replaced by a single space before the filtering; all
    other characters are kept as they are.
    """
    text = row.lower().replace("--", " ")
    return "".join(
        char for char in text
        if char != '"' and not ('0' <= char <= '9')
    )

def remove_number_punctuation_marks(row):
    """Lower-case *row* and keep only letters a-z, spaces, "-" and "'".

    Every "--" is replaced by a single space before the filtering; any
    other character (digits, punctuation, tabs, accented letters) is dropped.
    """
    text = row.lower().replace("--", " ")
    extra_allowed = (" ", "-", "'")
    return "".join(
        char for char in text
        if 'a' <= char <= 'z' or char in extra_allowed
    )

def load_file_without_punctuations_marks(filepath):
    """Load a text file as an RDD of word lists, one list per line.

    Each line is cleaned with ``remove_number_punctuation_marks`` and then
    split on whitespace.

    Args:
        filepath: path handed to ``sc.textFile``.

    Returns:
        An RDD of lists of words. Lines that contain no word after the
        cleaning are dropped, so no empty-string token is ever produced.
    """
    # load the dataset
    raw_text = sc.textFile(filepath)

    return (raw_text.filter(bool)                    # drop empty lines
        .map(remove_number_punctuation_marks)        # drop digits and punctuation
        # split() without argument collapses runs of whitespace and returns
        # [] for a blank string; split(" ") would turn "" into [''] and
        # make the empty string count as a word.
        .map(lambda row: row.split())
        .filter(bool)                                # drop lines left without words
       )

def load_file_without_number(filepath):
    """Load a text file as an RDD of word lists, one list per line.

    Each line is cleaned with ``remove_number_some_punctuation_marks``
    (which strips digits and double quotes only) and then split on
    whitespace.

    Args:
        filepath: path handed to ``sc.textFile``.

    Returns:
        An RDD of lists of words. Lines that contain no word after the
        cleaning are dropped, so no empty-string token is ever produced.
    """
    # load the dataset
    raw_text = sc.textFile(filepath)

    return (raw_text.filter(bool)                    # drop empty lines
        .map(remove_number_some_punctuation_marks)   # drop digits and double quotes
        # split() without argument collapses runs of whitespace and returns
        # [] for a blank string; split(" ") would turn "" into [''] and
        # make the empty string count as a word.
        .map(lambda row: row.split())
        .filter(bool)                                # drop lines left without words
       )

In [23]:
if __name__ == "__main__":
    # Driver cell: load one text, build its word counter, then print the
    # text length, the vocabulary/length ratio and the entropy.
    # NOTE(review): the loader uses a SparkContext named `sc` that is not
    # created in the visible code - confirm it exists before running.
    print("Caricamento del file ... ", end=" ")
    data = load_file_without_punctuations_marks("datasets/Anthony Trollope___The O'Conors of Castle Conor from Tales from all Countries.txt")
    print("caricamento completato")
    
    # Put the data in the cache
    data.persist()
    
    print("Calcoliamo l'RDD del word_counter ... ", end=" ")
    # word_counter returns the (word, count) RDD and the number of distinct words
    RDD_word_counter, vocabulary_size = word_counter(data)
    print("calcolo completato")
    
    print("text_length_in_word ... ", end=" ")
    RDD_text_length = text_length_in_words(RDD_word_counter)
    print(RDD_text_length.collect())
    
    # V/T ratio: distinct words divided by total words.
    # getValue collects the RDD again on every call.
    print("Rapporto V/T: ", vocabulary_size/getValue(RDD_text_length))
    
    print("Calcolo entropia ... ", end=" ")
    RDD_hentropy = hentropy(RDD_word_counter, getValue(RDD_text_length))
    print(getValue(RDD_hentropy))
    

Caricamento del file ...  caricamento completato
Calcoliamo l'RDD del word_counter ...  calcolo completato
text_length_in_word ...  [7653]
Rapporto V/T:  0.20358029530902913
Calcolo entropia ...  8.605861060321123


In [41]:
    # Continuation of the driver: reuses RDD_word_counter and RDD_text_length
    # defined in the previous cell.
    # Probabilities of the first 30 entries of the word counter.
    print("Calcolo della distribuzione di probabilità delle 30 parole più comuni ...", end=" ")
    RDD_prob_distr_of_30 = prob_distr_of_30_most_common_words(RDD_word_counter, getValue(RDD_text_length))
    print(getCollection(RDD_prob_distr_of_30), end="\n\n")
    
    # Probability of the word "the".
    print("Calcolo della probabilità di the ...", end=" ")
    RDD_prob_the = prob_of_The(RDD_word_counter, getValue(RDD_text_length))
    print(getValue(RDD_prob_the))
    
    # Probability of the most common word other than "the" and "and".
    print("Calcolo della probabilità della parola più comune escluso the e and ...", end=" ")
    RDD_prob_the_most_common_word = prob_of_the_most_common_word(RDD_word_counter, getValue(RDD_text_length))
    print(getValue(RDD_prob_the_most_common_word))
    

Calcolo della distribuzione di probabilità delle 30 parole più comuni ... [('the', 0.04494969293087678), ('i', 0.036717627074349925), ('and', 0.033320266562132494), ('to', 0.023650855873513656), ('of', 0.021298837057363126), ('a', 0.019730824513262774), ('my', 0.01620279628903698), ('that', 0.016072128577028617), ('in', 0.015941460865020254), ('was', 0.01528812230497844), ('said', 0.012152097216777734), ('as', 0.011106755520710831), ('he', 0.011106755520710831), ('at', 0.009930746112635568), ('but', 0.009669410688618842), ('you', 0.009277407552593753), ('for', 0.009146739840585392), ('me', 0.00862406899255194), ('it', 0.00862406899255194), ('had', 0.007840062720501764), ('with', 0.00718672416045995), ('not', 0.006533385600418136), ("o'conor", 0.006272050176401411), ('all', 0.006141382464393048), ('his', 0.005880047040376323), ('were', 0.00574937932836796), ('on', 0.005618711616359597), ('so', 0.004965373056317784), ('there', 0.004704037632301058), ('we', 0.004704037632301058)]

Calcolo