In [7]:
import nltk
nltk.download('punkt')
import math

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [8]:
dataset = {
    "tfidf_1.txt":open("tfidf_1.txt").read(),
    "tfidf_2.txt":open("tfidf_2.txt").read(),
    "tfidf_3.txt":open("tfidf_3.txt").read(),
    "tfidf_4.txt":open("tfidf_4.txt").read(),
    "tfidf_5.txt":open("tfidf_5.txt").read(),
    "tfidf_6.txt":open("tfidf_6.txt").read(),
    "tfidf_7.txt":open("tfidf_7.txt").read(),
    "tfidf_8.txt":open("tfidf_8.txt").read(),
    "tfidf_9.txt":open("tfidf_9.txt").read(),
    "tfidf_10.txt":open("tfidf_10.txt").read()
}

In [9]:
# Calculate term frequencies
def tf(dataset, file_name):
    text = dataset[file_name]
    tokens = nltk.word_tokenize(text)
    fd = nltk.FreqDist(tokens)
    return fd

In [10]:
# Calculate inverse document frequency
def idf(dataset, term):
    count = [term in dataset[file_name] for file_name in dataset]
    inv_df = math.log(len(count)/sum(count))
    return inv_df

In [11]:
def tfidf(dataset, file_name, n):
    term_scores = {}
    file_fd = tf(dataset,file_name)
    for term in file_fd:
        if term.isalpha():
            idf_val = idf(dataset,term)
            tf_val = tf(dataset, file_name)[term]
            tfidf = tf_val*idf_val
            term_scores[term] = round(tfidf,2)
    return sorted(term_scores.items(), key=lambda x:-x[1])[:n]

In [12]:
tf (dataset,"tfidf_1.txt")

FreqDist({"'": 1,
          "''": 1,
          "'s": 2,
          '(': 4,
          ')': 4,
          ',': 54,
          '.': 27,
          '1': 1,
          '100': 1,
          '11': 1,
          '15': 1,
          '1937': 1,
          '1939': 4,
          '1940': 1,
          '1941': 3,
          '1942': 1,
          '1943': 1,
          '1944': 2,
          '1945': 5,
          '26': 1,
          '30': 1,
          '46': 1,
          '50': 1,
          '6': 1,
          '8': 1,
          '85': 1,
          '9': 1,
          ':': 1,
          'Africa': 4,
          'Allied': 3,
          'Allies': 6,
          'Asia': 4,
          'Atlantic': 1,
          'August': 4,
          'Axis': 7,
          'Baltic': 1,
          'Based': 1,
          'Battle': 3,
          'Berlin': 1,
          'Blitz': 1,
          'Britain': 1,
          'British': 1,
          'Burma': 1,
          'Central': 1,
          'China': 3,
          'Cold': 1,
          'Commonwealth': 1,
          'Council': 

In [13]:
tf (dataset,"tfidf_2.txt")

FreqDist({"''": 2,
          "'s": 2,
          '(': 5,
          ')': 5,
          ',': 21,
          '.': 12,
          '02:56': 1,
          '11': 3,
          '16': 1,
          '1961': 1,
          '1969': 1,
          '1⁄2': 1,
          '20': 1,
          '20:18': 1,
          '21': 2,
          '21.5': 1,
          '24': 1,
          '47.5': 1,
          ':': 2,
          ';': 2,
          'After': 2,
          'Aldrin': 4,
          'Americans': 1,
          'Apollo': 5,
          'Armstrong': 6,
          'Broadcast': 1,
          'Buzz': 1,
          'CM': 1,
          'Center': 1,
          'Collins': 2,
          'Command': 3,
          'Congress': 1,
          'Earth': 5,
          'F.': 1,
          'Florida': 1,
          'Island': 1,
          'John': 1,
          'July': 4,
          'Kennedy': 2,
          'LM': 1,
          'Launched': 1,
          'Lunar': 3,
          'Merritt': 1,
          'Michael': 1,
          'Module': 7,
          'Moon': 4,
          'NASA

In [14]:
tfidf(dataset,"tfidf_1.txt",10)

[('Soviet', 20.72),
 ('Union', 18.42),
 ('Axis', 16.12),
 ('Japan', 11.27),
 ('Germany', 11.27),
 ('Allies', 9.66),
 ('invasion', 9.66),
 ('World', 9.21),
 ('Asia', 9.21),
 ('Africa', 9.21)]

In [15]:
for file_name in dataset:
    print("{0}: \n {1} \n".format(file_name, tfidf(dataset,file_name,20)))

tfidf_1.txt: 
 [('Soviet', 20.72), ('Union', 18.42), ('Axis', 16.12), ('Japan', 11.27), ('Germany', 11.27), ('Allies', 9.66), ('invasion', 9.66), ('World', 9.21), ('Asia', 9.21), ('Africa', 9.21), ('Japanese', 9.21), ('million', 8.05), ('Pacific', 8.05), ('II', 6.91), ('atomic', 6.91), ('China', 6.91), ('Kingdom', 6.91), ('surrender', 6.91), ('United', 6.24), ('European', 6.02)] 

tfidf_2.txt: 
 [('Module', 16.12), ('Armstrong', 13.82), ('lunar', 13.82), ('Apollo', 11.51), ('Moon', 9.21), ('Aldrin', 9.21), ('spacecraft', 9.21), ('Earth', 8.05), ('surface', 6.91), ('Lunar', 6.91), ('landed', 6.44), ('hours', 4.83), ('Command', 4.83), ('UTC', 4.61), ('Collins', 4.61), ('orbit', 4.61), ('returned', 4.61), ('Saturn', 4.61), ('Kennedy', 4.61), ('Space', 4.61)] 

tfidf_3.txt: 
 [('Napoleon', 32.19), ('French', 16.86), ('Coalition', 11.51), ('Prussia', 6.91), ('military', 6.02), ('Revolution', 6.02), ('Battle', 6.02), ('against', 5.5), ('France', 4.85), ('Europe', 4.85), ('army', 4.83), ('isl