In [1]:
import numpy as np
import heapq
from operator import itemgetter
import numpy.linalg as LA
import xml.etree.ElementTree as et, pandas as pd, re
from bs4 import BeautifulSoup
import gensim
from markdown import markdown
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
xtree = et.parse('Posts.xml')

xroot = xtree.getroot()

dfCols = ["Closed Date", "Favorite Count", "Comment Count", "Answer Count", "Tags", "Title",
          "Last Activity Date", "Owner User ID", "Body", "View Count", "Score", "Creation Date", "Post Type ID", 
          "ID", "Parent ID", "Last Edit Date", "Last Editor User ID", "Accepted Answer ID"]
dfRows = []

In [3]:
for node in xroot:
    closedDate = node.attrib.get("ClosedDate")
    favCount = node.attrib.get("FavoriteCount")
    commentCount = node.attrib.get("CommentCount")
    ansCount = node.attrib.get("AnswerCount")
    tags = node.attrib.get("Tags")
    title = node.attrib.get("Title")
    lastActDate = node.attrib.get("LastActivityDate")
    ownerUserID = node.attrib.get("OwnerUserId")
    body = node.attrib.get("Body")
    viewCount = node.attrib.get("ViewCount") 
    score = node.attrib.get("Score") 
    creationDate = node.attrib.get("CreationDate") 
    postTypeID = node.attrib.get("PostTypeId") 
    ID = node.attrib.get("Id") 
    parentID = node.attrib.get("ParentId") 
    lastEditDate = node.attrib.get("LastEditDate") 
    lastEditorUserID = node.attrib.get("LastEditorUserId") 
    acceptedAnswerID = node.attrib.get("AcceptedAnswerID")
    
    dfRows.append({"Closed Date": closedDate, "Favorite Count": favCount, "Comment Count": commentCount,
                     "Answer Count": ansCount, "Tags": tags, "Title": title, "Last Activity Date": lastActDate,
                     "Owner User ID": ownerUserID, "Body": body, "View Count": viewCount, "Score": score, 
                    "Creation Date": creationDate, "Post Type ID": postTypeID, "ID": ID, "Parent ID": parentID,
                    "Last Edit Date": lastEditDate, "Last Editor User ID": lastEditorUserID, "Accepted Answer ID": acceptedAnswerID})

In [4]:
out = pd.DataFrame(dfRows, columns=dfCols)

out = out.fillna(0)

out['Creation Date'] = pd.to_datetime(out['Creation Date'])
out['Creation Date'] = out['Creation Date'].dt.strftime('%Y/%m/%d')
out['Comment Count'] = out['Comment Count'].astype(int)
out['Owner User ID'] = out['Owner User ID'].astype(int)
out['Post Type ID'] = out['Post Type ID'].astype(int)
out['Score'] = out['Score'].astype(int)
out['Favorite Count'] = out['Favorite Count'].astype(int)
out['Answer Count'] = out['Answer Count'].astype(int)
out['View Count'] = out['View Count'].astype(int)

answers = out[(out['Post Type ID'] == 1)]

answers = answers[['ID','Creation Date','Tags','Title','Body']]

#words kolonu title ile bodynin birleşmiş hali, 
answers['Words'] = answers[['Title', 'Body', 'Tags']].apply(lambda x: ' '.join(x), axis=1)

answers['Words'].apply(lambda x: ''.join(BeautifulSoup(markdown(x)).findAll(text=True)))
answers.head

<bound method NDFrame.head of           ID Creation Date                                               Tags  \
0          1    2016/08/02  <neural-networks><backpropagation><terminology...   
1          2    2016/08/02  <neural-networks><machine-learning><statistica...   
3          4    2016/08/02                    <deep-network><search><neurons>   
4          5    2016/08/02                               <python><mindstorms>   
5          6    2016/08/02       <philosophy><definitions><intelligent-agent>   
6          7    2016/08/02                                <intelligent-agent>   
9         10    2016/08/02           <deep-network><terminology><fuzzy-logic>   
12        13    2016/08/02               <neural-networks><image-recognition>   
14        15    2016/08/02  <turing-test><strong-ai><intelligent-agent><we...   
15        16    2016/08/02  <deep-learning><definitions><overfitting><regu...   
16        17    2016/08/02  <philosophy><definitions><agi><superintelligen...  

In [5]:
size = len(answers.ID.to_list())

In [6]:
id_set = answers.ID.to_list() #Documents

In [7]:
def read_corpus(fname, tokens_only=False):
    for i, line in enumerate(fname):
        tokens = gensim.utils.simple_preprocess(line)
        if tokens_only:
            yield tokens
        else:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(tokens, [int(id_set[i])])

In [8]:
trainData =answers['Words'].tolist()#[:4068]
testData =answers['Words'].tolist()[4068:]
tags = dict(zip(answers.ID.astype(int), answers.Tags))

In [9]:
train_corpus = list(read_corpus(trainData))
test_corpus = list(read_corpus(testData, tokens_only=True))

In [10]:
idTextDict = dict(zip(answers.ID, answers.Words))

In [None]:
#model = gensim.models.doc2vec.Doc2Vec(vector_size=100, min_count=1, epochs=200)
model = gensim.models.doc2vec.Doc2Vec(min_count=1,window=5,vector_size=300,workers=5,alpha=0.025,min_alpha=0.00025,dm=1, epochs = 50)
#model = gensim.models.doc2vec.Doc2Vec(min_count=2,window=15,vector_size=300,workers=5,alpha=0.025,min_alpha=0.00025,dm=0, epochs=100)
model.build_vocab(train_corpus)
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
doc_id = id_set.index("14119")
print(doc_id)
inferred_vector = model.infer_vector(test_corpus[doc_id-4068])
sims = model.docvecs.most_similar([inferred_vector], topn=100)
print(id_set[id_set.index("14119")])

In [None]:
print(id_set.index("13425"))
id_set[id_set.index("13425")]

In [None]:
for (item,val) in sims:
    print(item, val)

In [None]:
cosineResultDict = {}
for i in range(0, len(test_corpus)):
    inferred_vector = model.infer_vector(test_corpus[i])
    sims = model.docvecs.most_similar([inferred_vector], topn=100)
    cosineResultDict[int(id_set[4068+i])] = sims 

In [None]:
cosineResultDict[13425]

In [None]:
relatedId = {}
with open("relatedFrom13425_test.txt") as f:
    for line in f:
        (key, val) = line.split(",")
        relatedId[int(key)] = [int(i) for i in val.strip().split()]
relatedId

In [None]:
crossCheckDict = {}
bothRelatedAndSim = {}
for key in cosineResultDict.keys():
    counter = 0
    if key in relatedId.keys():
        for rel in relatedId[key]:
            for (simId,sim) in cosineResultDict[key]:
                if rel == simId:
                    if key in bothRelatedAndSim.keys():
                        bothRelatedAndSim[key].append((simId,sim))
                    else:
                        bothRelatedAndSim[key] = [(simId,sim)]

                    counter += 1
    crossCheckDict[key] = counter


In [None]:
crossCheckDict

Sıfır related matchleyen caselere örnek:
14173 - relatedla sorunun alakası yok denecek kadar az - bizim bulduklarımız similarity degerine göre daha mantıklı
14204 - hiç related yok - bizim bulduklarımız some what similar
13849 - soru ve relatedları arasında 1-2 kelime matchliyor, anlam acısından bag yok - bizim bulduklarımızın context olarak benzerliği daha fazla

In [None]:
plt.bar(range(len(crossCheckDict)), list(crossCheckDict.values()), align='center')
plt.xticks(range(len(crossCheckDict)), list(crossCheckDict.keys()))
plt.show()

In [None]:
listOfVals = crossCheckDict.values()

In [None]:
len(listOfVals)

In [None]:
sum(listOfVals)/len(listOfVals)

In [None]:
sum(listOfVals)

In [None]:
for key in bothRelatedAndSim.keys():
    print("https://ai.stackexchange.com/questions/" + str(key))
    print([i for i,j in bothRelatedAndSim[key]])

In [None]:
f = open("DOC2VEC_RELATEDDICT.txt", "a")
for key in bothRelatedAndSim:  
    f.write(str(key)+"\t"+str([i for i,j in bothRelatedAndSim[key]])+"\n")
f.close()

In [None]:
for key in crossCheckDict.keys():
    if crossCheckDict[key] == 0:
        print("https://ai.stackexchange.com/questions/" + str(key))


In [None]:
x = relatedId.keys()
bins = []

for key in x:
    bins.append(len(relatedId[key]))

width = bins[1] - bins[0]
plt.bar(x, bins, align='center', width=width)
plt.show()


In [None]:
tags[1]

In [None]:
for key in tags.keys():
    s = tags[key].replace("<", "", len(tags[key]))
    s = s.replace(">", " ", len(tags[key]))
    tags[key] = s.strip().split(" ")


In [None]:
key = 13544
print(key, relatedId[key])
print(tags[key])
for item in relatedId[key]:
    if item in tags.keys():
        print(item, tags[item])

import requests
APIKEY = "unCQQDAhgl)qZ4GZRXVVGQ((";

query = "https://api.stackexchange.com/2.2/questions/" + str(54)+"?order=desc&sort=activity&site=ai&key="+APIKEY;
response = requests.get(query)


print(response.json()["items"][0]["tags"])

Api ile websitesi eşleşiyor ama post.xml dosyasındakiler ile eşleşmiyor

import time

betterTags = {}

for id in id_set:
    if id not in betterTags.keys():
        query = "https://api.stackexchange.com/2.2/questions/" + str(id)+"?order=desc&sort=activity&site=ai&key="+APIKEY;
        response = requests.get(query)
        print(response.status_code)
        betterTags[id] = response.json()["items"][0]["tags"]
        print(id, betterTags[id])
        time.sleep(1)

betterTags

In [None]:
tagVsQuestion = {}
for key in relatedId.keys():
    #print(key, relatedId[key])
    #print(tags[key])
    true = 0
    false = 0
    for item in relatedId[key]:
        if item in tags.keys():
            #print(item, tags[item])
            #print(np.in1d(tags[key],tags[item]).any())
            if np.in1d(tags[key],tags[item]).any():
                true += 1
            else:
                false += 0
            if true+false == 0:
                tagVsQuestion[key] = 0
            else:
                print(true,false)
                tagVsQuestion[key] = float(true /(true+false))
        else:
            tagVsQuestion[key] = 0


In [None]:
tagVsQuestion[13425]

In [None]:
print([x for x in tagVsQuestion.keys() if tagVsQuestion[x] == 0])

sadec 5 tane eşleşmiyor o datalarda elimizde mevcut değil normalde çalışıyorlar

In [None]:
model.save("CQA_FRONTEND/static/data/doc2vecmodel")

In [None]:
model2 = gensim.models.doc2vec.Doc2Vec.load("doc2vecmodel")

In [None]:
inferred_vector = model.infer_vector(["asda"])
sims = model.docvecs.most_similar([inferred_vector], topn=100)

In [None]:
sims