In [2]:
import numpy as np
import pandas as pd
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from time import time
from collections import Counter


In [10]:
class CalculateTFIDF:
    
    if __name__ == '__main__':
        
        trainData = pd.read_csv('../LematizedFiles/trainlem.csv', engine='python')
        testData = pd.read_csv('../LematizedFiles/testlem.csv', engine='python')

        def get_weight(count, eps=10000, min_count=2):
            if count < min_count:
                return 0
            else:
                return 1 / (count + eps)


        eps = 5000 
        words = (" ".join(trainData['lem_question1'])).lower().split()
        counts = Counter(words)
        words2 = (" ".join(trainData['lem_question2'])).lower().split()
        counts2 = Counter(words2)
        totalcount = counts + counts2
        weights = {word: get_weight(count) for word, count in totalcount.items()}

        print('Most common words and weights: \n')
        check_list = sorted(weights.items(), key=lambda x: x[1] if x[1] > 0 else 9999)[:45]
        stops = [i[0] for i in check_list]
        print(stops)



        print('\nLeast common words and weights: ')
        (sorted(weights.items(), key=lambda x: x[1], reverse=True)[:10])
        
        
        def tfidf_word_match_share(row):
            q1words = {}
            q2words = {}
            for word in str(row['lem_question1']).lower().split():
                if word not in stops:
                    q1words[word] = 1
            for word in str(row['lem_question2']).lower().split():
                if word not in stops:
                    q2words[word] = 1
            if len(q1words) == 0 or len(q2words) == 0:
                # The computer-generated chaff includes a few questions that are nothing but stopwords
                return 0

            shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
            total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]

            R = np.sum(shared_weights) / np.sum(total_weights)
            return R
        
        
        tfidf_train_word_match = trainData.apply(tfidf_word_match_share, axis=1, raw=True)
        tfidf_test_word_match = testData.apply(tfidf_word_match_share, axis=1, raw=True)
        
        
        trainData['lem_tfidf_word_match'] = tfidf_train_word_match
        testData['lem_tfidf_word_match'] = tfidf_test_word_match

        print(tfidf_test_word_match)

        print(trainData.info())


        trainData.to_csv('../LematizedFiles/trainlem.csv', index = False)
        testData.to_csv('../LematizedFiles/testlem.csv', index = False)
        
        

Most common words and weights: 

['?', 'be', 'the', 'what', 'do', 'a', 'i', 'how', 'to', 'in', 'of', 'and', 'can', 'for', ',', 'you', 'why', 'it', 'my', 'best', 'have', 'on', 'is', 'get', '.', 'or', 'which', 'if', 'some', 'that', 'with', 'should', "'s", 'an', 'from', 'your', 'good', 'india', 'will', 'make', 'like', 'people', 'when', 'who', ')']

Least common words and weights: 




0      0.708399
1      0.480824
2      0.000000
3      1.000000
4      0.589391
5      0.935088
6      0.749895
7      0.291571
8      0.821795
9      0.955675
10     0.211453
11     0.502059
12     1.000000
13     0.665331
14     0.248175
15     0.094984
16     0.545955
17     0.040958
18     0.634754
19     0.000000
20     1.000000
21     0.721105
22     0.506856
23     0.547938
24     0.822837
25     0.626195
26     0.453218
27     0.000000
28     0.364005
29     0.189823
         ...   
970    0.342787
971    0.939002
972    0.700376
973    1.000000
974    0.000000
975    0.124602
976    0.272418
977    0.674846
978    0.421901
979    0.000000
980    0.355868
981    0.196049
982    0.635575
983    0.718850
984    0.269952
985    0.000000
986    0.698308
987    0.480852
988    0.544421
989    0.524010
990    0.296819
991    0.509526
992    0.064402
993    0.537203
994    0.261114
995    0.613320
996    0.596079
997    0.615850
998    0.402284
999    0.832747
Length: 1000, dtype: flo