### Question 1 - [40 Points] Scoring and Term-Weighting

##### Import Files

In [1]:
from collections import defaultdict , Counter
from os import listdir
from os.path import isfile, join
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer as ps
from nltk.stem import WordNetLemmatizer as wl
from nltk.corpus import stopwords
import re
import math

#####  Preprocessing Data

In [2]:
# Preprocessing 
def preprocess(sent):
    result = sent.lower() # Converting to lower case
    result = re.sub(r'[^\w\s]','',result) # remove punctuation
    result = " ".join(result.split()) #remove blank spaces
    result = word_tokenize(result) # Word tokenizer
    result = [word for word in result if word not in stopword] # removing stopwords
    return result

##### Reading Files

In [3]:
mypath = 'dataset/'
files = [f for f in listdir(mypath) if isfile(join(mypath, f))] # all file names present in the folder
print("Total Files" , len(files))
fileSet = set(files)   
stopword = set(stopwords.words()) # set of stopwords

Total Files 1133


In [4]:
filesp = {}  # key:file name , value file content
for i in files:
    f =open('dataset/'+i, "r", encoding='cp850')
    filesp[i] = set(preprocess(f.read()))  # preprocessing the data

### Jaccard Coefficient [20 points]

In [5]:
def jac_coeff(query):
    query = set(preprocess(query))
    ans = []
    for i in filesp:
        uni = filesp[i].union(query) # union
        inter = filesp[i].intersection(query) # intesection
        ans.append( (i , len(inter) / len(uni) , inter , len(uni))) # Jaccard Coefficient
    ans.sort(key = lambda x:-x[1])
    print("TOP 5 DOCUMENTS ARE:-")
    print("File name   jaccard coefficient   words_present    total words")
    for i in range(5): # TOP 5 relevant queries with their jaccard Coefficient
        print(ans[i])
    print('\n\n')

In [6]:
for test in range(int(input("Enter number of Queries:  "))):
    jac_coeff(input())

Enter number of Queries:  4
cold water
TOP 5 DOCUMENTS ARE:-
File name   jaccard coefficient   words_present    total words
('pasta001.sal', 0.022222222222222223, {'water', 'cold'}, 90)
('japice.bev', 0.0196078431372549, {'water'}, 51)
('orgfrost.bev', 0.01818181818181818, {'cold'}, 55)
('montoys.txt', 0.017857142857142856, {'water'}, 56)
('antimead.bev', 0.017543859649122806, {'water', 'cold'}, 114)



bad boy
TOP 5 DOCUMENTS ARE:-
File name   jaccard coefficient   words_present    total words
('buffwing.pol', 0.01818181818181818, {'bad'}, 55)
('childrenbooks.txt', 0.015384615384615385, {'boy', 'bad'}, 130)
('normal.boy', 0.012195121951219513, {'boy'}, 82)
('normalboy.txt', 0.012195121951219513, {'boy'}, 82)
('forsooth.hum', 0.011904761904761904, {'boy'}, 84)



king kong
TOP 5 DOCUMENTS ARE:-
File name   jaccard coefficient   words_present    total words
('jrrt.riddle', 0.010638297872340425, {'king'}, 94)
('smokers.txt', 0.008, {'king'}, 125)
('yogisays.txt', 0.006369426751592357, {'

### TF-IDF Matrix [20 points]

In [7]:
idf = Counter() # IDF matrix
files_p = {}  
for i in files:
    f =open('dataset/'+i, "r", encoding='cp850')
    files_p[i] = preprocess(f.read())  # preprocessing the data 
    for i in set(files_p[i] ):
        idf[i] += 1  
for i in idf:
    idf[i] = math.log(len(files)/(idf[i] + 1 ))  # IDF(word)=log(total no. of documents/document frequency(word)+1)
vocab = set(idf.keys()) # vocabulary

In [8]:
def binary_tfidf():  # Binary
    tf = {}
    tf_idf = defaultdict(Counter)
    for i in files:
        f = files_p[i]
        tf[i] = Counter(list(set(f)))
        for j in tf[i]:
            tf_idf[i][j] = tf[i][j] * idf[j]
    return tf_idf
b_tfidf = binary_tfidf()

In [9]:
def raw_tfidf(): # Raw count
    tf = {}
    tf_idf = defaultdict(Counter)
    for i in files:
        f = files_p[i]
        tf[i] = Counter(f)
        for j in tf[i]:
            tf_idf[i][j] = tf[i][j] * idf[j]
    return tf_idf
r_tfidf = raw_tfidf()

In [10]:
def term_freq_tf_idf():# Term frequency
    tf = {}
    tf_idf = defaultdict(Counter)
    for i in files:
        f = files_p[i]
        tf[i] = Counter(f)
        totalwords = len(f)
        for j in tf[i]:
            tf_idf[i][j] = tf[i][j] * idf[j] / totalwords
    return tf_idf
tf_tfidf = term_freq_tf_idf()

In [11]:
def ln_tf_idf(): # Log normalization
    tf = {}
    tf_idf = defaultdict(Counter)
    for i in files:
        f = files_p[i]
        tf[i] = Counter(f)
        for j in tf[i]:
            tf_idf[i][j] = math.log( tf[i][j] + 1 ) * idf[j]
    return tf_idf
ln_tfidf =  ln_tf_idf()

In [12]:
def dn_freq_tf_idf(): # Double normalization
    tf = {}
    tf_idf = defaultdict(lambda: defaultdict(lambda: 0.5))
    for i in files:
        f = files_p[i]
        tf[i] = Counter(f)
        maxwords = max(list(tf[i].values()))
        for j in tf[i]:
            tf_idf[i][j] +=  0.5 * (tf[i][j] * idf[j] / maxwords)
    return tf_idf
dn_tfidf = dn_freq_tf_idf()

In [13]:
def evaluate(tf_idf , query):  # Function for evaluating the query
    ans = []
    query = {i for i in query if query[i] == 1}
    for i in tf_idf:
        score = 0
        for j in query:
            score += tf_idf[i][j] 
        ans.append((i , score))
    ans.sort(key = lambda x:-x[1])
    for i in range(5):
        print(ans[i])

In [14]:
for test in range(int(input("Enter number of Queries:  "))):
    query = set(preprocess(input("Enter Query: ")))
    query_vector = {}
    for i in vocab:
        if i in query:
            query_vector[i] = 1 
        else:
            query_vector[i] = 0 
    
    print("For  Binary Weighting Scheme")
    evaluate(b_tfidf , query_vector)
    print("\n\nFor  Raw count Weighting Scheme")
    evaluate(r_tfidf , query_vector)
    print("\n\nFor  Term frequency Weighting Scheme")
    evaluate(tf_tfidf , query_vector)
    print("\n\nFor  Log normalization Weighting Scheme")
    evaluate(ln_tfidf , query_vector)
    print("\n\nFor Double normalization Weighting Scheme")
    evaluate(dn_tfidf , query_vector)
    print("\n\n\n\n")

Enter number of Queries:  3
Enter Query: king kong
For  Binary Weighting Scheme
('cast.lis', 6.18671432591565)
('classicm.hum', 6.18671432591565)
('commutin.jok', 6.18671432591565)
('consp.txt', 6.18671432591565)
('drunk.txt', 6.18671432591565)


For  Raw count Weighting Scheme
('epi_merm.txt', 163.8946738319551)
('grail.txt', 60.61857799264093)
('blackadd', 26.941590218951525)
('pun.txt', 20.206192664213642)
('quest.hum', 17.961060145967682)


For  Term frequency Weighting Scheme
('epi_merm.txt', 0.0507413850872926)
('hotel.txt', 0.03675134552605772)
('yogisays.txt', 0.025368729019728364)
('pun.txt', 0.025226208070179328)
('epitaph', 0.025057282569709377)


For  Log normalization Weighting Scheme
('epi_merm.txt', 9.663196501400012)
('grail.txt', 7.481240703240202)
('insult.lst', 7.442684760707859)
('classicm.hum', 6.796800384929975)
('episimp2.txt', 6.3454977102363115)


For Double normalization Weighting Scheme
('hotel.txt', 2.313860602556564)
('pun.txt', 1.841924694342235)
('blooprs

#### Note: Answers for binary weighting scheme will vary a lot because most of the documents have same tf_idf score.
#### In all other queries, answers may differ a little bit because of the preprocessing part.