In [None]:
# importing modules

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import util
import ast
import pickle
import math
from scipy.spatial.distance import cosine
import subprocess
import random

from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
import gensim
import pyLDAvis.gensim
from numpy import linalg as LA

from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import svm
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.multiclass import OneVsRestClassifier
from keras.models import model_from_json
import warnings
from sklearn.metrics import confusion_matrix
warnings.filterwarnings('ignore')

In [2]:
df = util.get_processed_data("./../data/collections_math.csv", False)
collections = util.get_collections(df)
df.text[0]

'Solve the Linear equation Solve the Linear equation'

In [3]:
df.sort_values(by=['collection_id','sequence_id'],inplace=True)


In [4]:
lda = gensim.models.ldamodel.LdaModel
fileObject = open('./../PickelFiles/lda_science_dictionary.model','rb')  
dictionary = pickle.load(fileObject)
ldamodel = lda.load('./../PickelFiles/lda_25_sc.model')

In [5]:
# Model reconstruction from JSON file
with open('./../Final_Data/Validator_windowsize_3_archi.json', 'r') as f:
    validator = model_from_json(f.read())

# Load weights into the new model
validator.load_weights('./../Final_Data/Validator_windowsize_3_weights.h5')


In [6]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27754 entries, 3 to 27749
Data columns (total 6 columns):
collection_id    27754 non-null object
sequence_id      27754 non-null int64
resource_id      27754 non-null object
title            27754 non-null object
description      27754 non-null object
text             27754 non-null object
dtypes: int64(1), object(5)
memory usage: 1.5+ MB


In [8]:
#Removing words with length 1 since tfidf does not recognize them

for col in collections:
    lrs = []
    for i,text in enumerate(col["texts"]):
        new_text = ""
        for word in text.split():
            if len(word)>2:
                new_text+=word+" "
        if new_text.strip()!="":
            lrs.append(new_text.strip().lower())
    col["texts"] = lrs

In [9]:
# loading doc2vec ad word2vec models and functions for return embeddingsfor a paticular text

d2v= Doc2Vec.load("./../doc2vec_100dim.model")

def document_embeddings(doc):
    test_data = word_tokenize(doc.lower())
    return d2v.infer_vector(test_data)        


word2vec = Word2Vec.load('./../Word2Vec_100dim.bin')

def Word2doc(doc):
    words=doc.split()
    emb=np.zeros(100)
    for word in words:
        if(word in word2vec.wv.vocab):
            emb = np.add(emb,word2vec[word])
    if(len(doc)!=0) :       
        return emb/len(words)
    else:
        return(np.zeros(100))

In [10]:
# loading svm model

pickle_in = open("./../data/svm_model_cosine_embed_novelty_SC.sav","rb")
svm_fun = pickle.load(pickle_in)


In [11]:
# function to calculate cosin similarity for two embedding

def cosin(v1,v2):
    if(LA.norm(v1)!=0 and LA.norm(v2)!=0):
        return (np.dot(np.array(v1),np.array(v2))/(LA.norm(v1) * LA.norm(v2)))
    else:
        return 1  


In [12]:
# java -jar NoveltySemanticCoherence.jar  SinglePair

def getNoveltySC(text1,text2):
    cmd =['java','-jar','NoveltySemanticCoherence.jar','SinglePair',text1,text2]
    subprocess.call(cmd)
    f = open("./data/pairwise/singlepairwise", "r")
    sc =float(f.readline()) 
    nv = float(f.readline())
    return sc,nv

In [13]:
# ldamodel.get_document_topics(["solve"],per_word_topics=True)

In [14]:
# function to predict similarity between two text using svm model

def text_similarity_svm(text1, text2):
    
    dv1 = document_embeddings(text1)
    dv2 = document_embeddings(text2)
    
    wd1 = Word2doc(text1)
    wd2 = Word2doc(text1)
    
    ds = cosin(dv1,dv2)
    ws = cosin(wd1,wd2)

    clean_matrix1 = clean(text1).split()
    clean_matrix2 = clean(text2).split() 

    clean_matrix1 = dictionary.doc2bow(clean_matrix1)  
    clean_matrix2 = dictionary.doc2bow(clean_matrix2) 
    
    lda_1 = ldamodel.get_document_topics(clean_matrix1,per_word_topics=True)[0] 
    lda_2 = ldamodel.get_document_topics(clean_matrix2,per_word_topics=True)[0] 

#     print(lda_1)
#     lda_1 = clean_matrix1.apply(lambda x: ldamodel.get_document_topics(x,per_word_topics=True)[0])
#     lda_2 = clean_matrix1.apply(lambda x: ldamodel.get_document_topics(x,per_word_topics=True)[0])

    a_list=[]
    b_list=[]
    for i,j in lda_1:
        a_list.append(j)

    for i,j in lda_2:
        b_list.append(j)
    
    a_list=np.array(a_list)
    b_list=np.array(b_list)

    kl = np.sum(np.where(a_list!=0,a_list*np.log(a_list/b_list),0))    
    sc,nv =getNoveltySC(text1,text2)
    
    data=[]
    
    data.append(ds)
    data.append(ws)
    data.append(sc)
    data.append(nv)
    data.append(kl)
        
    similarity = svm_fun.predict_proba(np.array(data).reshape(1,5))[0][1]
    
    return similarity

In [15]:
def get_embedding(text):
    dv1 = document_embeddings(text)
    wd1 = Word2doc(text)
    clean_matrix1 = clean(text).split()
    clean_matrix1 = dictionary.doc2bow(clean_matrix1)      
    lda = ldamodel.get_document_topics(clean_matrix1,per_word_topics=True)[0] 
#     print(type(wd1))
    data = []
    data += list(wd1)
    data += list(dv1)
    for j in range(0,20):
            data += [lda[j][1]]
    return data                

In [48]:
def get_validator_score(a,b,c):
    print("entered validator checking")
    temp=[]
    frame=[]
    data1=get_embedding(a)
    data2=get_embedding(b)    
    data3=get_embedding(c)    
    
    temp.append(data1)
    temp.append(data2)
    temp.append(data3)
    frame.append(temp)
    
    score = validator.predict(np.reshape(frame,(1,3,220)))
    print("left validator checking")
    return score[0][1]

In [17]:
coll=df.collection_id.values
df.head(5)

Unnamed: 0,collection_id,sequence_id,resource_id,title,description,text
3,0008d66a-753f-4639-8634-81bb3abb3269,1,231eb4ad-d0e8-4e94-a552-f8bd2358a47a,Solve the linear equation: _______,Solve the linear equation: [1/2] ...,Solve the linear equation Solve the linear equ...
2,0008d66a-753f-4639-8634-81bb3abb3269,2,0b248202-12a9-405b-acd1-4ab8250e4198,"If , then _______","If , then [1/5]. &nbsp;Please writ...",If then If then nbsp Please write your answer ...
0,0008d66a-753f-4639-8634-81bb3abb3269,3,2ee6f80f-0851-4cfa-b4bf-2655e9c46ab7,Solve the Linear equation: _______,Solve the Linear equation: [2],Solve the Linear equation Solve the Linear equ...
4,0008d66a-753f-4639-8634-81bb3abb3269,4,81c6995c-dd95-418e-a8c4-c22d8ccd32e9,Solve the linear equation: _______,Solve the linear equation: [-18],Solve the linear equation Solve the linear equ...
1,0008d66a-753f-4639-8634-81bb3abb3269,5,15a464a4-a2d8-41dd-b00e-ff7fea0aa720,"If , then _______","If , then [-4]",If then If then


In [18]:
# function to get mean text similarity score of given corpus(data)

def get_mean_score_pair(data, score_function):
    score_sum = 0.0
    for lr1, lr2 in data:
        score_sum += score_function(lr1, lr2)
    return score_sum/len(data)

In [19]:
# list of all learning resources 

lr_list = []
col_list = [] 
count = 0
for col in collections:
    if(len(col["texts"])>6): col_list.append(col["id"])
    for lr in col["texts"]:
        if(len(lr)>0):
            lr_list.append(lr)


In [80]:
def neighbourhood_text(text,threshold):
    cos_sim = []
    return_list = []
    for i in lr_list:
            cs = cosin(Word2doc(text),Word2doc(i)) 
            cos_sim.append(cs)
            if((threshold -0.1)<cs and cs<(threshold +0.1) and i not in return_list):
                return_list.append(i)
    return return_list[:min(5,len(return_list))]                    

In [79]:
# function to generate next learning learning resource given starting and list of learning resources to select and 
# minimum similarity between them.

def get_next_lr(given_lr, lr_list, target_score, collection, score_function):
    next_lr = ""
    score_diff = 0
    for lr in lr_list:
        if lr not in collection:
            score = score_function(given_lr, lr)
            if (score - target_score) > score_diff:
                score_diff = (score - target_score)
                next_lr = lr
    return next_lr

In [105]:
# function to create a collection/pathway 

def create_collection(collection_size, start_lr, target_score, score_function):
    lr_list = neighbourhood_text(start_lr,0.7)
    collection = [start_lr]
    lr = start_lr
    if(start_lr in lr_list): 
        lr_list.remove(start_lr)
    for i in range(collection_size - 1):
        next_lr = get_next_lr(lr, lr_list, target_score, collection, score_function)
        collection.append(next_lr)
        old = lr
        lr = next_lr
        lr_list = neighbourhood_text(lr,0.7)   

        if(i>=2 & len(collection)>=2):
            print("length of collection is ",len(collection))
            print("value of i is ",i)
            sco = get_validator_score(collection[i-2],collection[i-1],collection[i])
            if(sco < 0.5 ):
                i=i-1
                collection.pop()
                lr = old
                
        if(next_lr in lr_list):
            lr_list.remove(next_lr)
        
        print(next_lr)
            
    return collection

In [106]:
# all possible pairs of consecutive resources

lr_pairs = []

for col in collections:
    for i in range(len(col["texts"]) -1):
        lr1 = col["texts"][i]
        lr2 = col["texts"][i+1]
        lr_pairs.append([lr1, lr2])

In [107]:
# all_cosine = []

# for pair in lr_pairs:
#     wv1=Word2doc(pair[0])
#     wv2=Word2doc(pair[1])
#     all_cosine.append(cosin(wv1,wv2))

# calculating mean of all consecutive learning resources
# mean = get_mean_score_pair(lr_pairs, text_similarity_svm)

In [108]:
# plt.hist(all_cosine,bins=30)
# plt.ylabel('Cosine Similarity');    
# plt.show()  

In [109]:
def sortSecond(val): 
    return val[1]

In [110]:
def generating_collection(seed_val,threshold):
    random.seed(seed_val)
    start_lr = lr_list[random.randint(0,len(lr_list))]
    print(start_lr)

#     temp_list = [] 
#     wd1 = Word2doc(start_lr)
#     for i in lr_list:
#         wd2 = Word2doc(i)
#         ws = cosin(wd1,wd2)
#         temp_list.append([i,ws])
#     temp_list.sort(key = sortSecond ,reverse = True)

#     hist = []
#     for i in temp_list:
#         hist.append(i[1])
#     plt.hist(hist,bins=60)
#     plt.ylabel('Cosine Similarity');    
#     plt.show()    
    
#     neig_list = [] 
#     for i in range(0,min(10,len(temp_list))):
#         if(temp_list[i][1]!=1 and temp_list[i][0] not in neig_list):
#             neig_list.append(temp_list[i][0])
    
    new_collection = create_collection(8, start_lr, threshold, text_similarity_svm)

    # print(new_collection)
    f = open("./../Final_Data/Genetrated collections/collection_KL_Validator_"+str(seed_val)+"_"+str(threshold),"w")
    for i in new_collection:
        f.write(i+"\n")
        f.write("--------------------------------------------------------------\n")
        print(i)
        print("----------------------------------------------------------------")

In [None]:
# generating_collection(2019,0.5)

In [112]:
generating_collection(2016,0.5)

there relationship between moisture content when straight line provides reasonable summary the relationship between two numerical variables say thatthe two variables are linearly related that there linear relationship between the two variables take look the scatter plots below and answer the questions that follow there relationship between moisture content and frying time the data points look scattered there relationship between moisture content and frying time does the relationship look linear
how can the graph how can the graph horizontally translated positive negative
draw graph made function the board draw graph made function the board labeled and show how translate right orleft units using the equation
the effects these transformations the graph there nothing special about using the function students did this lesson the effects these transformations the graph function hold true for all functions

length of collection is  6
value of i is  4
entered validator checking
left validator

IndexError: list index out of range

In [None]:
generating_collection(1,0.5)