In [1]:
# Reference used: https://www.geeksforgeeks.org/tf-idf-for-bigrams-trigrams/
import nltk 
import re 
import math
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import normalize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.util import ngrams
import pandas as pd   
import jsonlines
import json
from tqdm.notebook import tqdm
import pickle

In [2]:
documents = []
docID = {}
term_total_citations = {}
author_term_citations = {}
idx = 0
with jsonlines.open('data_india_sample.jl') as reader:
    for obj in tqdm(reader, leave=False):
        documents.append(obj)
        docID[idx] = obj['user']
        author_term_citations[obj['user']] = {}
        idx += 1

|          | 0/? [00:00<?, ?it/s]

In [3]:
# Code Taken From: https://www.geeksforgeeks.org/tf-idf-for-bigrams-trigrams/
def remove_string_special_characters(s):
    # removes special characters with ' '
    stripped = re.sub('[^a-zA-z\s]', ' ', s)
    stripped = re.sub('_', ' ', stripped)
      
    # Change any white space to one space
    stripped = re.sub('\s+', ' ', stripped)
      
    # Remove start and end white spaces
    stripped = stripped.strip()
    if stripped != '':
            return stripped.lower()

In [4]:
# Code Reference: https://www.geeksforgeeks.org/python-stemming-words-with-nltk/
def stem_string(sentence):
    ps = PorterStemmer()
    
    words = word_tokenize(sentence)
    words = [ps.stem(word) for word in words]
    return " ".join(words)

In [5]:
def remove_stop_words(sentence):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(sentence)
    words = [word for word in words if word not in stop_words]
    return " ".join(words)

In [6]:
def process_string(sentence):
    sentence = remove_string_special_characters(sentence)
    sentence = stem_string(sentence)
    sentence = remove_stop_words(sentence)
    return sentence

### TF-IDF Matrix

In [7]:
def update_term_citations(document, sentence, numCitations, userID):
    numCitations += 1
    sentence = sentence.lower()
    sentence = re.sub(r'[^a-zA-Z0-9\s]', ' ', sentence)
    tokens = [token for token in sentence.split(" ") if token != ""]
    for n in range(1, 5):
#         seen_ngms = []
        ngms = list(ngrams(tokens, n))
        for ngm in ngms:
            gm = " ".join(ngm)
#             if gm in seen_ngms:
#                 continue
#             seen_ngms.append(gm)
            if gm in author_term_citations[userID]:
                author_term_citations[userID][gm] += numCitations
            else:
                author_term_citations[userID][gm] = numCitations
            if gm in term_total_citations:
                term_total_citations[gm] += numCitations
            else:
                term_total_citations[gm] = numCitations

In [8]:
word_documents = []
for document in tqdm(documents, leave=False):
    sentences = []
    for subject in document['subjects']:
        sentence = process_string(subject)
        sentences.append(sentence)
        update_term_citations(document, sentence, int(document['citationsAll']), document['user'])
    for paper in document['papers']:
        try:
            sentence = paper['title'] + " " + paper['conference']
            sentence = process_string(sentence)
        except: 
            continue
#         print(sentence)
#         print(sentence)
        update_term_citations(document, sentence, paper['citations'], document['user'])
        sentences.append(sentence)
#     word_documents.append(sentences)
    word_documents.append(".\n".join(sentences))
#     print(word_documents[-1])
#     word_documents.append(sentences)
#     print(".".join(sentences))

  0%|          | 0/1000 [00:00<?, ?it/s]

In [9]:
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,4),
                             , min_df=0, analyzer='word')
# vectorizer vectorizeridfVectorizer()
tfidf_matrix = vectorizer.fit_transform(word_documents) 
# print(type(tfidf_matrix[0]))
# print("HIHIHI")
# print(vectorizer.idf_)
features = (vectorizer.get_feature_names())
# print(len(features))
# print(features)
# print(tfidf_matrix.shape)
# print(tfidf_matrix)
#     print(vectorizer.toarray())

In [10]:
# print(tfidf_matrix)
# print(tfidf_matrix[(0, 3235)])

In [11]:
# print(author_term_citations)
# feature_idx = {}
# for idx, feature in enumerate(features):
#     feature_idx[feature] = idx
# author_idx = 0
# for author, author_dict in tqdm(author_term_citations.items(), leave=False):
#     for term, term_citations in author_dict.items():
#         if term in feature_idx:
#             term_idx = feature_idx[term]
#             log_factor = 1 + math.log( 1 + ((term_citations) / (term_total_citations[term] + 1)) , 2)
# #             print(log_factor)
#             tfidf_matrix[(author_idx, term_idx)] *= log_factor
#     author_idx += 1

In [12]:
# tfidf_matrix = normalize(tfidf_matrix, norm='l2', axis=0)

### Pickle the Matrix and Vectorizer

In [13]:
pickle.dump(vectorizer, open("web_data/tfidf_vectorizer.pkl", "wb"))
pickle.dump(tfidf_matrix, open("web_data/tfidf_matrix.pkl", "wb"))
pickle.dump(docID, open("web_data/doc_id_dict.pkl", "wb"))

### Query Subjects

In [14]:
query="data mining"
# print(features[3730])
query = process_string(query)
result_matrix = vectorizer.transform([query])
print(result_matrix)
cosine_similarities = linear_kernel(result_matrix, tfidf_matrix).flatten()
print(len(cosine_similarities))
related_docs_indices = cosine_similarities.argsort()[:-10:-1]
print(related_docs_indices)
related_results = [docID[i] for i in related_docs_indices]
print(related_results)
# print(result_matrix)

  (0, 83601)	1.0
1000
[848 277 234 555 805 157 744 989 158]
['ZLOm0jEAAAAJ', 'wM-ma0MAAAAJ', '2oe3sXwAAAAJ', 'vcngn2gAAAAJ', 'ag3m3Y8AAAAJ', 'fREQGZkAAAAJ', 'hPvt6d8AAAAJ', 'b4hhMpwAAAAJ', 'u2h2WiMAAAAJ']


### Activeness over 'n' years

In [15]:
for doc in doccuments:
    
    
    
    

SyntaxError: unexpected EOF while parsing (<ipython-input-15-36ec8c92a847>, line 5)

### Pickled Testing

In [None]:
loaded_vectorizer = pickle.load(open("web_data/tfidf_vectorizer.pkl","rb"))
loaded_matrix = pickle.load(open("web_data/tfidf_matrix.pkl","rb"))
query="data mining"
# print(features[3730])
query = process_string(query)
result_matrix = loaded_vectorizer.transform([query])
cosine_similarities = linear_kernel(result_matrix, loaded_matrix).flatten()
related_docs_indices = cosine_similarities.argsort()[:-10:-1]
related_results = [docID[i] for i in related_docs_indices]
print(related_results)