## Implementing a TF-IDF score calculator from scratch (more or less)

---

In [1]:
def createTextList(path, encoding='utf-8'):
    """Read every regular file directly inside `path` and return their texts.

    Parameters
    ----------
    path : str
        Folder containing the text files. A trailing path separator is
        optional.
    encoding : str, optional
        Text encoding used to decode the files (default 'utf-8').

    Returns
    -------
    list of str
        The full content of each file, ordered by file name. An empty file
        contributes an empty string.
    """
    import os
    ## list the folder once; sort the names because os.listdir gives no
    ## ordering guarantee and downstream code treats the first text specially
    filepaths = [os.path.join(path, name) for name in sorted(os.listdir(path))]
    ## create a list of all the texts extracted from each file
    texts_list = []
    for file in filepaths:
        ## skip sub-folders (e.g. '.ipynb_checkpoints'): open() would fail on them
        if not os.path.isfile(file):
            continue
        with open(file, encoding=encoding) as f:
            ## read the whole file, so multi-line files are not truncated
            ## and empty files do not raise
            texts_list.append(f.read())
    return texts_list

In [2]:
## remove special characters / marks / punctuation
## translation table deleting punctuation, brackets, the backslash and digits;
## a raw string is used so that the backslash is not read as an escape sequence
_MARKS_TABLE = str.maketrans('', '', r'''!()[]{}<>;?@#$%:'"\,./^&*_0123456789''')


def removeMarks(word):
    """Return `word` with every mark character and digit deleted.

    Characters outside the mark set (for example '-') are kept. The result
    may be an empty string when `word` consists only of marks.
    """
    ## one pass over the string instead of one replace() per offending character
    return word.translate(_MARKS_TABLE)

## read and prepare file with stopwords, downloaded from an online list
def getStopwords(filename='stopwords.txt'):
    """Load the stopword list stored in `filename`, one entry per line.

    Every newline and tab character is deleted from each line; other
    whitespace is left untouched. Blank lines yield empty strings.

    Returns a list of str, in file order.
    """
    with open(filename) as handle:
        raw_lines = handle.readlines()
    ## drop newlines and tabs wherever they occur in the line
    return [line.replace('\n', '').replace('\t', '') for line in raw_lines]

## remove stopwords
def removeStopwords(texts_list, stopwords=None):
    """Drop stopwords from each tokenized text.

    Parameters
    ----------
    texts_list : list of list of str
        One list of words per text.
    stopwords : iterable of str, optional
        Words to remove. When omitted, the list is loaded with
        getStopwords() (i.e. from 'stopwords.txt').

    Returns
    -------
    list of list of str
        The texts with every word whose lowercase form is a stopword
        removed. Kept words retain their original case.
    """
    if stopwords is None:
        stopwords = getStopwords()
    ## a set gives constant-time membership tests instead of scanning a list per word
    stopword_set = set(stopwords)
    return [
        [word for word in text if word.lower() not in stopword_set]
        for text in texts_list
    ]

## remove marks and stopwords 
def removeMarksAndStopwords(texts_list, stem=True, stemmer='snowball'):
    """Tokenize, clean and optionally stem a list of raw texts.

    Each text is split on whitespace, every token is passed through
    removeMarks(), tokens left empty are dropped, and stopwords are removed
    with removeStopwords().

    Parameters
    ----------
    texts_list : list of str
        The raw texts.
    stem : bool, optional
        When true (default), every remaining word is stemmed with NLTK.
    stemmer : str, optional
        'lancaster' or 'porter' select those stemmers; any other value
        selects the English Snowball stemmer (default).

    Returns
    -------
    list of list of str
        One list of cleaned (and possibly stemmed) words per text.
    """
    ## strip marks from every token and keep only the non-empty results
    tokenized_texts = [
        [cleaned for cleaned in map(removeMarks, text.split()) if cleaned]
        for text in texts_list
    ]
    filtered_texts = removeStopwords(tokenized_texts)

    ## nothing more to do when stemming is switched off
    if not stem:
        return filtered_texts

    ## NLTK is imported lazily, only for the stemmer actually requested
    if stemmer == 'lancaster':
        from nltk.stem.lancaster import LancasterStemmer
        st = LancasterStemmer()
    elif stemmer == 'porter':
        from nltk.stem.porter import PorterStemmer
        st = PorterStemmer()
    else:
        from nltk.stem.snowball import SnowballStemmer
        st = SnowballStemmer('english')

    stemmed_texts = []
    for words in filtered_texts:
        stemmed_texts.append([st.stem(word) for word in words])
    return stemmed_texts

In [3]:
## TF calculation
def getTF(text):
    """Compute the term frequency of every word in one tokenized text.

    Words are lowercased before counting. Each count is divided by the
    total number of words in `text`.

    Returns a dict mapping lowercase word -> relative frequency, keyed in
    order of first appearance. An empty `text` gives an empty dict.
    """
    total_words = len(text)
    occurrences = {}
    for token in text:
        lowered = token.lower()
        occurrences[lowered] = occurrences.get(lowered, 0) + 1
    ## normalise the raw counts by the length of the text
    return {term: hits / total_words for term, hits in occurrences.items()}

## calculate TF scores
def tfScores(cleaned_texts_list):
    """Compute the TF dictionary and the vocabulary of every text.

    Returns a tuple (tf_list, documents_list):
    tf_list holds the getTF() result of each text, and documents_list holds
    the list of distinct words (the keys of that result) of each text.
    """
    tf_list = []
    documents_list = []
    for text in cleaned_texts_list:
        frequencies = getTF(text)
        tf_list.append(frequencies)
        documents_list.append(list(frequencies))
    return tf_list, documents_list

## calculate IDF
def getIDF(cleaned_texts_list):
    """Compute the inverse document frequency of every word in the corpus.

    The IDF of a word is log(N / df) + 1, where N is the number of texts
    and df the number of texts containing the word.

    Returns a tuple (tf_list, df_dict, documents_list):
    tf_list and documents_list are the values returned by tfScores(), and
    df_dict maps each word to its IDF value (despite the name, it does not
    hold raw document frequencies). An empty corpus gives an empty df_dict.
    """
    import math
    from collections import Counter
    tf_list, documents_list = tfScores(cleaned_texts_list)
    ## each entry of documents_list holds the distinct words of one text, so
    ## counting every entry once gives the number of texts containing each word
    document_frequency = Counter()
    for document in documents_list:
        document_frequency.update(document)
    n_documents = len(documents_list)
    df_dict = {
        term: math.log(n_documents / frequency) + 1
        for term, frequency in document_frequency.items()
    }
    return tf_list, df_dict, documents_list

## calculate TF-IDF
def getTFIDF(cleaned_texts_list, doc_index=0):
    """Compute the TF-IDF scores of one text against the whole corpus.

    Parameters
    ----------
    cleaned_texts_list : list of list of str
        The tokenized texts.
    doc_index : int, optional
        Position of the text to score (default 0, the first text).

    Returns
    -------
    tuple (tf_idf_dict_sorted, documents_list)
        tf_idf_dict_sorted maps each word of the selected text to its
        TF-IDF score, sorted from highest to lowest; documents_list is the
        per-text vocabulary returned by getIDF(). An empty corpus gives an
        empty score dict.

    Raises
    ------
    IndexError
        If `doc_index` is out of range for a non-empty corpus.
    """
    tf_list, df_dict, documents_list = getIDF(cleaned_texts_list)
    ## with no texts there is nothing to score
    if not tf_list:
        return {}, documents_list
    tf_idf_dict = {
        term: frequency * df_dict[term]
        for term, frequency in tf_list[doc_index].items()
    }
    tf_idf_dict_sorted = dict(sorted(tf_idf_dict.items(), key=lambda item: item[1], reverse=True))
    return tf_idf_dict_sorted, documents_list

## TF-IDF summary table
def tfidfSummary(cleaned_texts_list):
    """Build a table of TF-IDF scores and document counts.

    The scores come from getTFIDF(). For every scored word the table also
    reports in how many texts the word appears.

    Side effect: sets the pandas option 'display.max_rows' to the number
    of scored words, so that the whole table is displayed.

    Returns a pandas DataFrame indexed by word, with the columns
    'TF-IDF_score' and '#_documents_word_appears'.
    """
    import pandas as pd
    scores, documents = getTFIDF(cleaned_texts_list)
    rows = {}
    for term, score in scores.items():
        ## number of texts whose vocabulary contains the term
        n_documents = sum(1 for document in documents if term in document)
        rows[term] = [score, n_documents]
    pd.set_option('display.max_rows', len(scores))
    summary = pd.DataFrame(rows, index=['TF-IDF_score','#_documents_word_appears'])
    ## transpose so that each word becomes a row
    return summary.T

---

The texts in the 'files' folder are taken from the English Wikipedia page on _Natural Language Processing_ (https://en.wikipedia.org/wiki/Natural_language_processing). They are used here only as example data.

In [4]:
## build the list of texts to analyze, one entry per file
## the txt files with the texts are stored in the 'files' folder
texts = createTextList('./files/')

In [5]:
## remove special marks, punctuation and stopwords
## optional: word stemming, choosing among 3 different NLTK stemmers
## ('lancaster', 'porter' or the default 'snowball')
cleaned_texts_list = removeMarksAndStopwords(texts,
                                             stem=True # stem the words, use default stemmer (Snowball)
                                             )

In [6]:
## calculate the TF-IDF scores of the first text against the whole collection
## and show them next to the number of texts in which each word appears
tf_idf_summary = tfidfSummary(cleaned_texts_list)
tf_idf_summary

Unnamed: 0,TF-IDF_score,#_documents_word_appears
document,0.145882,2.0
languag,0.12906,13.0
comput,0.123632,6.0
natur,0.113344,10.0
understand,0.084175,3.0
process,0.073104,9.0
subfield,0.059807,1.0
scienc,0.059807,1.0
concern,0.059807,1.0
program,0.059807,1.0


Next step to do: 

Implement an option to calculate a TF-IDF score for n-grams, rather than for single words.

---