In [389]:
import pandas as pd
import numpy as np
import nltk
import string
import re  
import spacy
import collections
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [390]:
f=open('docs.txt')
doc_str = f.read()
docs = doc_str.split(".I")

In [391]:
docs_data = []
for t in docs:
    if t.strip() != "":
        i = t.split(".W\n")[0]
        w = t.split(".W\n")[1]
        docs_data.append({"I": i.strip(), "W": w.strip()})

df_docs = pd.DataFrame(docs_data)
df_docs.head()

Unnamed: 0,I,W
0,1,correlation between maternal and fetal plasma ...
1,2,changes of the nucleic acid and phospholipid l...
2,3,surfactant in fetal lamb tracheal fluid . ...
3,4,placental and cord blood lipids.. comparison i...
4,5,free fatty acid concentration in maternal plas...


In [392]:
f=open('queries.txt')
query_str = f.read()
queries = query_str.split(".I")

In [393]:
queries_data = []
for t in queries:
    if t.strip() != "":
        i = t.split(".W\n")[0]
        w = t.split(".W\n")[1]
        queries_data.append({"I": i.strip(), "W": w.strip()})

df_queries = pd.DataFrame(queries_data)
df_queries.head()

Unnamed: 0,I,W
0,1,"the crystalline lens in vertebrates, including..."
1,2,the relationship of blood and cerebrospinal fl...
2,3,electron microscopy of lung or bronchi.
3,4,tissue culture of lung or bronchial neoplasms.
4,5,the crossing of fatty acids through the placen...


In [394]:
f=open('relevance.txt')
relevance_str = f.read()
relevance = relevance_str.strip().split("\n")

In [395]:
# Split each line into columns
rows = [list(map(float, line.strip().split())) for line in relevance]

# Create a DataFrame from the rows
df_relevance = pd.DataFrame(rows, columns=["query", "doc", "col3", "col4"])
df_relevance = df_relevance.drop(['col3', 'col4'], axis=1)

In [396]:
df_relevance = df_relevance.astype(int)
df_docs['I'] = df_docs['I'].astype(int)
df_queries['I'] = df_queries['I'].astype(int)

df_rele_doc = pd.merge(df_relevance, df_docs, left_on='doc', right_on='I')
df = pd.merge(df_rele_doc, df_queries, left_on='query', right_on='I')
df = df.rename(columns={'W_x':'docs', 'W_y':'queries'})
final_df = df[['docs', 'queries', 'doc', 'query']]

# Feature Engineering

In [397]:
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk import word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [398]:
def count_chars(text):
    return len(text)

def count_words(text):
    return len(text.split())

def count_capital_words(text):
    return len([w for w in text.split() if w.isupper()])

def count_punctuations(text):
    return len([w for w in text if w in list(string.punctuation)])

def count_words_in_quotes(text):
    x = re.findall("\'.\'|\".\"", text)
    count=0
    if x is None:
        return 0
    else:
        for i in x:
            t=i[1:-1]
            count+=count_words(t)
        return count
    
def count_sent(text):
    return len(nltk.sent_tokenize(text))

def count_paras(text):
    return len(text.split('\n'))

def count_unique_words(text):
    return len(set(text.split()))

def count_mentions(text):
    x = re.findall(r'(@w[A-Za-z0-9]*)', text)
    return len(x)

def count_stopwords(text):
    stop_words = set(stopwords.words('english'))  
    word_tokens = word_tokenize(text)
    stopwords_x = [w for w in word_tokens if w in stop_words]
    return len(stopwords_x)

# Pos Tagging

In [399]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
def pos_count(text):
    nn_count = 0   #Noun
    pr_count = 0   #Pronoun
    vb_count = 0   #Verb
    jj_count = 0   #Adjective
    uh_count = 0   #Interjection
    cd_count = 0   #Numerics
    
    text = nltk.word_tokenize(text)
    text = nltk.pos_tag(text)
    
    for token in text:
            if token[1] in ['NN','NNP','NNS']:
                nn_count += 1
    if token[1] in ['PRP','PRP$']:
                pr_count += 1
    if token[1] in ['VB','VBD','VBG','VBN','VBP','VBZ']:
                vb_count += 1
    if token[1] in ['JJ','JJR','JJS']:
                jj_count += 1
    if token[1] in ['UH']:
                uh_count += 1
    if token[1] in ['CD']:
                cd_count += 1
    
    return pd.Series([nn_count, pr_count, vb_count, jj_count, uh_count, cd_count])
                
final_df[['nn_count', 'pr_count', 'vb_count', 'jj_count', 'uh_count', 'cd_count ']] = final_df['docs'].apply(pos_count)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [400]:
final_df['char_count'] = final_df['docs'].apply(lambda x:count_chars(x))
final_df['word_count'] = final_df['docs'].apply(lambda x:count_words(x))
final_df['sent_count'] = final_df['docs'].apply(lambda x:count_sent(x))
final_df['para_count'] = final_df['docs'].apply(lambda x:count_paras(x))
final_df['capital_word_count'] = final_df['docs'].apply(lambda x:count_capital_words(x))
final_df['punct_count'] = final_df["docs"].apply(lambda x:count_punctuations(x))
final_df['quoted_word_count'] = final_df['docs'].apply(lambda x:count_words_in_quotes(x))
final_df['stopword_count'] = final_df['docs'].apply(lambda x:count_stopwords(x))
final_df['unique_word_count'] = final_df['docs'].apply(lambda x:count_unique_words(x))
final_df['mention_count'] = final_df['docs'].apply(lambda x:count_mentions(x))
final_df['avg_wordlength'] = final_df['char_count']/final_df['word_count']
final_df['avg_sentlength'] = final_df['word_count']/final_df['sent_count']
final_df['unique_vs_words'] = final_df['unique_word_count']/final_df['word_count']
final_df['stopwords_vs_words'] = final_df['stopword_count']/final_df['word_count']

# Pre Processing

In [401]:
# Removing URLs
def remove_url(text):
    return re.sub(r"http\S+", "", text)

#Removing Punctuations
def remove_punct(text):
    new_text = []
    for t in text:
        if t not in string.punctuation:
            new_text.append(t)
    return ''.join(new_text)


#Tokenizer
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')



#Removing Stop words
def remove_sw(text):
    new_text = []
    for t in text:
        if t not in stopwords.words('english'):
            new_text.append(t)
    return new_text

#Lemmatizaion
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def word_lemmatizer(text):
    new_text = []
    for t in text:
        lem_text = lemmatizer.lemmatize(t)
        new_text.append(lem_text)
    return new_text

In [402]:
final_df['docs'] = final_df['docs'].apply(lambda t: remove_url(t))

final_df['docs'] = final_df['docs'].apply(lambda t: remove_punct(t))

final_df['docs'] = final_df['docs'].apply(lambda t: tokenizer.tokenize(t.lower()))

final_df['docs'] = final_df['docs'].apply(lambda t: remove_sw(t))

final_df['docs'] = final_df['docs'].apply(lambda t: word_lemmatizer(t))
        
for i in range(0, len(final_df)):
    final_df.iloc[i, 0] = ' '.join(final_df.iloc[i, 0])

In [403]:
from sklearn.feature_extraction.text import TfidfVectorizer
sorted_docs = []
relevance_scores =[]

In [404]:
for i in range(1, len(df_queries)+1):

  df_ind = final_df.loc[final_df['query'] == i]
    
  vectorizer  =  TfidfVectorizer()

  train_tf_idf_features =  vectorizer.fit_transform(df_ind['docs']).toarray()

  train_tf_idf = pd.DataFrame(train_tf_idf_features)

  other_features = df_ind.drop(['docs', 'queries', 'doc', 'query'], axis=1)

  train_tf_idf.reset_index(drop=True, inplace=True)
  other_features.reset_index(drop=True, inplace=True)

  X = pd.concat([train_tf_idf, other_features], axis=1)

  # Train a simple linear regression model on the TF-IDF vectors
  
  regression = LinearRegression()
  regression.fit(X, range(len(df_ind['docs'].to_list())))

      # Predict the relevance score of each document for the given query
  prediction = regression.predict(X)

      # Sort the documents based on their relevance scores
  document_relevance = list(zip(final_df['docs'].to_list(), prediction))
  sorted_documents = sorted(document_relevance, key=lambda x: x[1], reverse=True)

  for t in sorted_documents:
    x, y = t
    sorted_docs.append(x)
    relevance_scores.append(y)
  



In [405]:
sorted_docs_df = pd.DataFrame(sorted_docs).rename(columns={0:'sorted_docs'})

In [406]:
relevance_scores_df = pd.DataFrame(relevance_scores).rename(columns={0:'relevance_scores'})

In [407]:
len(final_df)

696

In [408]:
ranking_df = pd.concat([final_df, sorted_docs_df, relevance_scores_df], axis=1)

In [409]:
ranking_df.head()

Unnamed: 0,docs,queries,doc,query,nn_count,pr_count,vb_count,jj_count,uh_count,cd_count,...,quoted_word_count,stopword_count,unique_word_count,mention_count,avg_wordlength,avg_sentlength,unique_vs_words,stopwords_vs_words,sorted_docs,relevance_scores
0,analysis mammalian lens protein electrophoresi...,"the crystalline lens in vertebrates, including...",13,1,22,0,0,0,0,0,...,0,24,47,0,7.380282,17.75,0.661972,0.338028,2627 chicken lens development epithelial cell ...,36.0
1,autoradiographic study cell migration eye lens...,"the crystalline lens in vertebrates, including...",14,1,66,0,0,0,0,0,...,0,84,111,0,6.080569,19.181818,0.526066,0.398104,1747 problem albuminoid albuminoid main consti...,35.0
2,lens development differentiation embryonic chi...,"the crystalline lens in vertebrates, including...",15,1,58,0,0,0,0,0,...,0,82,123,0,7.809524,23.333333,0.585714,0.390476,1745 deaggregation bovine lens acrystallin sed...,34.0
3,study aging horse crystalline lens gel contrib...,"the crystalline lens in vertebrates, including...",72,1,36,0,0,0,0,0,...,0,40,63,0,7.0,33.0,0.636364,0.40404,1751 subunit acrystallin acrystallin isolated ...,33.0
4,histological research lens condition hypoxia c...,"the crystalline lens in vertebrates, including...",79,1,31,0,0,0,0,0,...,0,52,56,0,6.925234,17.833333,0.523364,0.485981,1972 structural study acrystallin acrystallin ...,32.0


In [410]:
#ranking_df.to_csv('venwiz_linear_regression_rank.csv', index=False)