In [1]:
import re
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import gensim
from gensim import corpora, models, similarities
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
#prepare data:
with open("cran.1400", "r") as file:
    #data = file.readlines()
    data = file.read()
#Extraction des documents to docs
docs = data.split(r'.I')

titles = []
texts = []
for doc in docs:
    if not doc:
        continue
    #extract headers---------
    titles.append( doc.split(r'.T')[1].split(r'.A')[0][1:] )
    #extract text------------
    texts.append( doc.split(r'.W')[1][1:] )

documents = texts

# Figure out what is happening here. You have already seen this.


df = pd.DataFrame({'title': titles , 'text' : texts})
df.head()

# #############################################################

Unnamed: 0,title,text
0,experimental investigation of the aerodynamics...,experimental investigation of the aerodynamics...
1,simple shear flow past a flat plate in an inco...,simple shear flow past a flat plate in an inco...
2,the boundary layer in simple shear flow past a...,the boundary layer in simple shear flow past a...
3,approximate solutions of the incompressible la...,approximate solutions of the incompressible la...
4,one-dimensional transient heat conduction into...,one-dimensional transient heat conduction into...


In [3]:
def tokenize_regex_punct_keep( text):
    # delete all non words exept html tags
    text = re.sub('[^\w<>]',' ',text)
    # delete javascript tags
    text =re.sub('< *script*>.*?< *script*>',' ',text)
    # delete all html tags
    text = re.sub('<.*?>',' ',text)
    # delete numbers 
    text = re.sub("[0-9><,]+"," ",text)
    # delete reteur a la ligne
    text = re.sub(r"\n+"," ",text)
    # replace multiple spaces with one space
    text = re.sub(r"\s+"," ",text)
    # transfer text to lowercase
    text = text.lower() 
    # tokenaze text
    tokens = re.split(" ", text)
    # Remove stop words        
    stop_words = stopwords.words('english')
    tokens = [word for word in tokens if word not in stop_words and word]
    
    return tokens

tokens = []
for doc in texts:
    tokens.append(tokenize_regex_punct_keep(doc))

df = pd.DataFrame({'tokens':tokens})
df.head()

Unnamed: 0,tokens
0,"[experimental, investigation, aerodynamics, wi..."
1,"[simple, shear, flow, past, flat, plate, incom..."
2,"[boundary, layer, simple, shear, flow, past, f..."
3,"[approximate, solutions, incompressible, lamin..."
4,"[one, dimensional, transient, heat, conduction..."


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
#clean = data.replace('.T\n','\n').replace('.A\n','\n').replace('.B\n','\n').replace('.W\n','\n').split('\n.I ')
vectoriser = CountVectorizer()
countin = vectoriser.fit_transform(texts)
print(countin.toarray())
#vectoriser.get_feature_names()

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [5]:
vectoriser2 = CountVectorizer(analyzer='word', ngram_range=(2, 2))
countin2 = vectoriser2.fit_transform(texts)
#print(vectoriser2.get_feature_names())
print(countin2.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [6]:
# Figure out what is happening here. You have already seen this.
Vcount = TfidfVectorizer(analyzer='word', ngram_range=(1,1), stop_words = 'english')
countMatrix = Vcount.fit_transform(texts)
print(countMatrix.shape)
print(countMatrix.toarray())
#print(Vcount.get_feature_names())

(1400, 7185)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [7]:
print(countMatrix)

  (0, 2608)	0.07798045187624733
  (0, 1526)	0.08621934766393637
  (0, 6075)	0.08079953859335902
  (0, 2302)	0.04720764923483853
  (0, 2376)	0.09251143301007075
  (0, 6517)	0.04180848467846198
  (0, 2824)	0.03248958241451342
  (0, 4958)	0.08148637342301955
  (0, 571)	0.09130920837735514
  (0, 6307)	0.1374191300544049
  (0, 5449)	0.1162935508189057
  (0, 3550)	0.09190092678436508
  (0, 2297)	0.05060801151796442
  (0, 1643)	0.08492249695883744
  (0, 3835)	0.04338098842506674
  (0, 1072)	0.04060356874680492
  (0, 1988)	0.4122573901632147
  (0, 5084)	0.09130920837735514
  (0, 3421)	0.2551925902001386
  (0, 6297)	0.10406391237155205
  (0, 5889)	0.08713520317832768
  (0, 2525)	0.09379386351883931
  (0, 6370)	0.11426753859896151
  (0, 1790)	0.07798045187624733
  (0, 3931)	0.07201638012796313
  :	:
  (1399, 3465)	0.21436958908399065
  (1399, 3639)	0.08702433007099791
  (1399, 4882)	0.24672214821322933
  (1399, 1117)	0.21519428379708375
  (1399, 3957)	0.2579265836262177
  (1399, 6248)	0.14521812

In [8]:
# ### Loading the sample queries

cranQuery = open('cran.qry').read().replace('.W\r','').replace('\n.W\n',' ').split('.I ')[1:]

cranQuery[0]
queryDict = dict()
queryVects = dict()

df = pd.DataFrame({'query': cranQuery})
df.head()

Unnamed: 0,query
0,001 what similarity laws must be obeyed when c...
1,002 what are the structural and aeroelastic pr...
2,004 what problems of heat conduction in compos...
3,008 can a criterion be developed to show empir...
4,009 what chemical kinetic system is applicable...


In [9]:
for item in cranQuery:
    stuff = item.split('\r\n\n')
    queryDict[stuff[0]] = stuff[-1].strip('\r\n').replace('\r',' ')
    queryVects[stuff[0]] = Vcount.transform([stuff[-1].strip('\r\n').replace('\r',' ')])
#df = pd.DataFrame({'queryDict:':queryDict})
#df1 = pd.DataFrame({'queryDict:':queryVects})
#df1


In [10]:
# ### Loading the Query relevance Judgements

from collections import defaultdict
queryRel = open('cranqrel').read().split('\n')

queryRelDict = defaultdict(dict)
for item in queryRel:
    stuff = item.split()
    try:
        queryRelDict[stuff[0]][stuff[2]].append(stuff[1])
    except:
        queryRelDict[stuff[0]][stuff[2]] = list()
        queryRelDict[stuff[0]][stuff[2]].append(stuff[1])
        
#df = pd.DataFrame({'query rel': queryRel})
#df.head()
#print(queryVects['001 what similarity laws must be obeyed when constructing aeroelastic models\nof heated high speed aircraft .\n'])

In [11]:
# Query 1: "what similarity laws must be obeyed when constructing aeroelastic models of heated high speed aircraft"
from sklearn.metrics.pairwise import cosine_similarity

cosMattf = cosine_similarity(queryVects['001 what similarity laws must be obeyed when constructing aeroelastic models\nof heated high speed aircraft .\n'],countMatrix)
related_docs_indices = cosMattf[0].argsort()[:-11:-1]


for item in related_docs_indices:
    print ('Document', item+1, cosMattf[0][item])

tp = list()
for item in queryRelDict['1'].keys():
    for stuff in related_docs_indices:
        if str(stuff+1)in queryRelDict['1'][item]:
            tp.append(stuff+1)

Document 13 0.24698762534357854
Document 184 0.24314870518096476
Document 12 0.21496361584522503
Document 51 0.15919875853269266
Document 486 0.15845921423443213
Document 327 0.12273934220419946
Document 878 0.12222010551096335
Document 1268 0.12170987087140378
Document 435 0.11776747449291272
Document 686 0.11482796252361308


In [12]:
# ### Precision
print (tp)

#All other entries which are in related_docs_indices but not in queryRelDict['1'] are false positives
precision = 1.0*len(tp)/len(related_docs_indices)

print ('Precision is', precision)

[184, 12, 51, 13, 486]
Precision is 0.5


In [13]:
# ### Recall


recallDocLen = 0
for item in queryRelDict['1'].keys():
    recallDocLen += len(queryRelDict['1'][item])
    
print (recallDocLen)

recall = 1.0*len(tp)/recallDocLen

print ('recall is', recall)

29
recall is 0.1724137931034483


In [14]:
# ### Precision and Recall @ K

# change the value for k
k = 30

cosMattf = cosine_similarity(queryVects['001 what similarity laws must be obeyed when constructing aeroelastic models\nof heated high speed aircraft .\n'],countMatrix)
related_docs_indices = cosMattf[0].argsort()[:-1*(k+1):-1]

tp = list()
for item in queryRelDict['1'].keys():
    for stuff in related_docs_indices:
        if str(stuff+1)in queryRelDict['1'][item]:
            tp.append(stuff+1)
            
precision = 1.0*len(tp)/len(related_docs_indices)
recall = 1.0*len(tp)/recallDocLen

print ('Precision is', precision)
print ('Recall is', recall)

Precision is 0.23333333333333334
Recall is 0.2413793103448276


In [15]:
#Average Precision @ Key

relOrNot = [0]*k
for item in queryRelDict['1'].keys():
    for i in range(len(related_docs_indices)):
        if str(related_docs_indices[i]+1) in queryRelDict['1'][item]:
            relOrNot[i] = 1       
            
print (relOrNot)
avgPs = list()

for i in range(len(relOrNot)):
    if relOrNot[i] == 1:
        print ('P@',i+1,' : ',sum(relOrNot[:i+1])*1.0/(i+1))
        avgPs.append(sum(relOrNot[:i+1])*1.0/(i+1))

        
print ('Average Precision @ K for query 1 :', sum(avgPs)/len(avgPs))

[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
P@ 1  :  1.0
P@ 2  :  1.0
P@ 3  :  1.0
P@ 4  :  1.0
P@ 5  :  1.0
P@ 13  :  0.46153846153846156
P@ 18  :  0.3888888888888889
Average Precision @ K for query 1 : 0.8357753357753358


In [16]:
import gensim # commonly used for semantic analysis, topic modeling and similarity analysis.
import nltk
# tokenize corpora
TOKENIZED_CORPUS = [nltk.word_tokenize(sentence) for sentence in texts]
tokenized_new_doc = [nltk.word_tokenize(sentence) for sentence in cranQuery]
# build the word2vec model on our training corpus
model = gensim.models.Word2Vec(TOKENIZED_CORPUS, size=5, window=2, min_count=1)
print (model.wv['determine'])
print (model.wv['problem'])

[-1.3515753  -0.5364526  -0.02669794 -2.3212914   0.58708036]
[-2.5638003   0.555126   -0.71360904 -2.0995162   1.3016542 ]


In [17]:
from gensim import corpora
dict = corpora.Dictionary(tokens)
print(dict)
print(dict.num_docs)

Dictionary(6926 unique tokens: ['aerodynamics', 'agree', 'angles', 'attack', 'basis']...)
1400


In [29]:
# la methode : doc2bow(document,...)  kt7awel les doc --> l' vector d les frequense des mots
corpus_doc2bow_vectors = [dict.doc2bow(tok_doc) for tok_doc in tokens]
corpus_doc2bow_vectors[:1]
#for c in corpus_doc2bow_vectors:
    #c

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 3),
  (11, 1),
  (12, 3),
  (13, 1),
  (14, 2),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 2),
  (19, 1),
  (20, 1),
  (21, 2),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 2),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 4),
  (32, 1),
  (33, 2),
  (34, 1),
  (35, 2),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 5),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 3)]]

In [30]:
tfidf_model = models.TfidfModel(corpus_doc2bow_vectors, id2word=dict, normalize=False)
corpus_tfidf_vectors = tfidf_model[corpus_doc2bow_vectors]
print("\n\ntf-idf")
docvector = []
for doc_vector in corpus_tfidf_vectors:
    #print(doc_vector)
    docvector.append(doc_vector)
df1 = pd.DataFrame({'doc vector avec tfidf ': docvector})
df1.head()



tf-idf


Unnamed: 0,doc vector avec tfidf
0,"[(0, 5.866248611111173), (1, 5.451211111832329..."
1,"[(5, 8.02860530443977), (12, 3.723290657269129..."
2,"[(5, 3.211442121775908), (22, 1.99176778305553..."
3,"[(5, 8.02860530443977), (13, 2.520473774269442..."
4,"[(30, 1.81458649128868), (69, 2.56246786293407..."


In [31]:
with open('cran.qry','r') as file :
    qwery_file = file.read()
qwery_file = qwery_file.split(r'.I')
qwery_file = [ q.split(r'.W')[1][1:] for q in qwery_file if q ]

tokens_q = [ tokenize_regex_punct_keep(q) for q in qwery_file ]
query_zouina = []
for i in tokens_q:
    query_zouina.append(i)

df =  pd.DataFrame(query_zouina)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,similarity,laws,must,obeyed,constructing,aeroelastic,models,heated,high,speed,...,,,,,,,,,,
1,structural,aeroelastic,problems,associated,flight,high,speed,aircraft,,,...,,,,,,,,,,
2,problems,heat,conduction,composite,slabs,solved,far,,,,...,,,,,,,,,,
3,criterion,developed,show,empirically,validity,flow,solutions,chemically,reacting,gas,...,instantaneous,local,chemical,equilibrium,,,,,,
4,chemical,kinetic,system,applicable,hypersonic,aerodynamic,problems,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220,papers,applicable,problem,calculation,procedures,laminar,incompressible,flow,arbitrary,pressure,...,,,,,,,,,,
221,anyone,investigated,shear,buckling,stiffened,plates,,,,,...,,,,,,,,,,
222,papers,shear,buckling,unstiffened,rectangular,plates,shear,,,,...,,,,,,,,,,
223,practice,close,reality,assumptions,flow,hypersonic,shock,tube,using,nitrogen,...,,,,,,,,,,


In [32]:
df = pd.DataFrame({'query tokens ': tokens_q})
df.head()

Unnamed: 0,query tokens
0,"[similarity, laws, must, obeyed, constructing,..."
1,"[structural, aeroelastic, problems, associated..."
2,"[problems, heat, conduction, composite, slabs,..."
3,"[criterion, developed, show, empirically, vali..."
4,"[chemical, kinetic, system, applicable, hypers..."


In [33]:
# Create a bow vector for a new document (for exmaple: a query)
#query = tokens_q[0]
#query_bow_vector = dict.doc2bow(query)
#print(query_bow_vector)
#type(query_bow_vector)
queryBowVector = []
for query1 in tokens_q:
    queryBowVector = dict.doc2bow(query1)
df = pd.DataFrame({'query bow: ': queryBowVector})
df

Unnamed: 0,query bow:
0,"(8, 1)"
1,"(31, 1)"
2,"(40, 1)"
3,"(222, 1)"
4,"(224, 1)"
5,"(356, 1)"
6,"(417, 1)"
7,"(420, 1)"
8,"(867, 1)"


In [34]:
# Create a bow vector for a new document (for exmaple: a query)
query = tokens_q[0]
query_bow_vector = dict.doc2bow(query)
#df = pd.DataFrame({'query to vec' : query_bow_vector})
#df
print(query_bow_vector)

[(79, 1), (88, 1), (111, 1), (405, 1), (407, 1), (457, 1), (462, 1), (469, 1), (950, 1), (1096, 1)]


In [35]:
# Calculate (compute) TF-IDF vector of the query
query_tfidf_vector = tfidf_model[query_bow_vector]
print(query_tfidf_vector)

[(79, 2.574694165267329), (88, 4.836501267717121), (111, 2.975477680865931), (405, 6.45121111183233), (407, 4.301463992327647), (457, 5.643856189774724), (462, 6.866248611111173), (469, 4.807354922057604), (950, 4.451211111832329), (1096, 8.129283016944965)]


In [36]:
index_matrix = similarities.SparseMatrixSimilarity(corpus_tfidf_vectors, num_features= 6926 ) 

sims = index_matrix[query_tfidf_vector]

print(type(sims))
# print(list(enumerate(sims)))

sims_id = list(enumerate(sims))
sims_id.sort(key=lambda tup: tup[1], reverse=True)
# sims_id = sorted(list(enumerate(sims)), key=lambda tup: tup[1])
sims_id = sims_id[:10]
# print(sims_id[0])

d = { 'Query' : [ qwery_file[0] for _ in range(10) ], \
'doc_id' : [ s[0]+1 for s in  sims_id ], \
'Document' : [ documents[ s[0] ] for s in  sims_id ], \
'Cosined' :  [ s[1] for s in  sims_id ]}

df2 = pd.DataFrame(d)
df2.head(10)

<class 'numpy.ndarray'>


Unnamed: 0,Query,doc_id,Document,Cosined
0,what similarity laws must be obeyed when const...,13,similarity laws for stressing heated wings .\n...,0.265568
1,what similarity laws must be obeyed when const...,184,scale models for thermo-aeroelastic research ....,0.249552
2,what similarity laws must be obeyed when const...,12,some structural and aerelastic considerations ...,0.191958
3,what similarity laws must be obeyed when const...,51,theory of aircraft structural models subjected...,0.155191
4,what similarity laws must be obeyed when const...,486,similarity laws for aerothermoelastic testing ...,0.151618
5,what similarity laws must be obeyed when const...,1268,stable combustion of a high-velocity gas in a ...,0.130805
6,what similarity laws must be obeyed when const...,327,on local flat plate similarity in the hyperson...,0.128621
7,what similarity laws must be obeyed when const...,435,application of similar solutions to calculatio...,0.114283
8,what similarity laws must be obeyed when const...,746,aeroelastic problems in connection with high s...,0.114219
9,what similarity laws must be obeyed when const...,875,models for aeroelastic investigation .\n this...,0.111453
