In [1]:
from __future__ import division
from sklearn.cluster import KMeans
from numbers import Number
from pandas import DataFrame
import sys, codecs, numpy

In [2]:
class autovivify_list(dict):
  def __missing__(self, key):
    '''Given a missing key, set initial value to an empty list'''
    value = self[key] = []
    return value

  def __add__(self, x):
    '''Override addition for numeric types when self is empty'''
    if not self and isinstance(x, Number):
      return x
    raise ValueError

  def __sub__(self, x):
    '''Also provide subtraction method'''
    if not self and isinstance(x, Number):
      return -1 * x
    raise ValueError

In [3]:
def build_word_vector_matrix(vector_file, n_words):
  '''Return the vectors and labels for the first n_words in vector file'''
  numpy_arrays = []
  labels_array = []
  with codecs.open(vector_file, 'r', 'utf-8') as f:
    for c, r in enumerate(f):
      sr = r.split()
      labels_array.append(sr[0])
      numpy_arrays.append( numpy.array([float(i) for i in sr[1:]]) )

      if c == n_words:
        return numpy.array( numpy_arrays ), labels_array

  return numpy.array( numpy_arrays ), labels_array

In [4]:
def find_word_clusters(labels_array, cluster_labels):
  '''Return the set of words in each cluster'''
  cluster_to_words = autovivify_list()
  for c, i in enumerate(cluster_labels):
    cluster_to_words[ i ].append( labels_array[c] )
  return cluster_to_words

In [5]:
input_vector_file = 'vectors.txt' 
n_words = int(1262) # Number of words to analyze 
reduction_factor = float(0.1) # Amount of dimension reduction {0,1}
n_clusters = int( n_words * reduction_factor ) # Number of clusters to make
df, labels_array = build_word_vector_matrix(input_vector_file, n_words)
kmeans_model = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
kmeans_model.fit(df)


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=126, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [6]:
cluster_labels  = kmeans_model.labels_
cluster_inertia   = kmeans_model.inertia_
cluster_to_words  = find_word_clusters(labels_array, cluster_labels)

clusters = []
for c in cluster_to_words:
    clusters.append(cluster_to_words[c])

In [7]:
clust_dict = {}
for clus in clusters:
    if len(clus) == 1:
        if 0 not in clust_dict:
            clust_dict[0] = clus
        else:
            clust_dict[0] += clus
    else:
        clust_dict[clusters.index(clus)] = clus
for key, value in clust_dict.items():
    print(key, '    ', value)

0      ['the', 'of', 'is', 'to', 'and', 'in', 'that', 'as', 'it', 'by', 'are', 'or', 'used', 'an', 'vector', 'we', 'problem', 'methods']
5      ['05', '000149611', '000254484', '00028446', '000342545', '000599943', '000717298', '000782035', '000788925', '000861262', '000908618', '000918943', '00100001', '00120731', '00134052', '00135353', '00148223', '00148554', '0015764', '00158112', '00159783', '0122e', '1804e', '28277e', '74935e', '79304e', '07', '000295394']
8      ['be', 'can']
13      ['for', 'example']
14      ['this', 'using']
16      ['probability', 'theory', 'prior']
17      ['programming', 'dynamic', 'at', 'computer', 'science']
19      ['which', 'new', 'way', 'form']
22      ['pagerank', 'on', 'page', 'links', 'other', 'pages', 'web']
23      ['term', 'if', 'occurs']
24      ['from', 'how', 'important', 'results', 'uses', 'high', 'linked', 'name', 'rank', 'sites']
25      ['classes', 'derived', 'base', 'referred']
26      ['h0k', 'he', 'ubx', 'hhk', 'hij', 'hijses']
27     

In [8]:
import os, sys, nltk, re

# Open a file
path = "DATA-NLP"
dirs = os.listdir( path )



In [9]:
question_mapping = dict() # Mapping to question dictionary
 # Mapping to student as key and answer as value

for file in dirs:
    if file == '.DS_Store':
            continue
    path_to_file=path+'/'
    file_name = file
    split_name = file.split('_')
    student_name = split_name[0]
    question_number = split_name[1].split('.')[0]
    if question_number not in question_mapping:
        question_mapping[question_number] = {}
    path_to_file += file_name
    with open(path_to_file, 'r', encoding = 'latin1') as f:
        mylist = f.read()
        sent_tokenize_list = nltk.sent_tokenize(mylist)
        temp = []
        for i in sent_tokenize_list:
            sent = i.lower()
            sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
            temp.append(sent)
        question_mapping[question_number][student_name] = temp
        

In [10]:
# Creating vector of sentences

vector_dict = {}
for key, value in question_mapping.items():
    vector_dict[key] = {}
    for student, answer in value.items():
        vector_dict[key][student] = []
        for sent in answer:
            words = sent.split(' ')
            temp_list = []
            for w in words:
                for cluster_key, cluster_value in clust_dict.items():
                    if w in clust_dict[cluster_key]:
                        temp_list.append( cluster_key)
            vector_dict[key][student].append(temp_list)
            
for k, v in vector_dict.items():
    print(k, '    ', v)
    break


taskc      {'g0pC': [[0, 0, 40, 40, 0, 32, 32, 32, 0, 0, 0, 44, 32, 69, 61, 28, 122], [32, 32, 0, 0, 32, 32, 53, 0, 85], [0, 0, 0, 46, 0, 0, 75, 0, 53, 85], [0, 0, 14, 64, 0, 0, 0, 0, 0, 32, 0, 0, 22, 0, 0, 0, 13, 14, 64, 8, 0, 43, 65, 32, 86, 39, 36], [14, 57, 55, 0, 32, 0, 39, 36, 0, 76, 0, 0, 78], [0, 32, 0, 33, 40, 0, 32, 8, 42, 8, 36, 0, 0, 46, 64, 55, 0, 32, 0, 28, 0, 43, 43, 0, 86, 32], [69, 0, 43, 0, 19, 8, 8, 0, 0, 0, 0], [69, 0, 64, 30, 14, 64, 71, 0, 43, 43, 32, 32, 0, 8, 8, 0, 0, 43, 0, 8, 8, 0, 0, 0, 14, 0, 64, 8, 66, 0, 46, 0, 22, 14], [0, 71]], 'g0pE': [[0, 84, 0, 63, 0, 36, 0, 36, 0, 27, 0, 40, 0, 42, 0, 0, 0, 40, 0, 40, 40, 0, 0, 0, 0, 38, 38, 62, 36, 22, 32, 32, 0, 32, 0, 59, 0, 84, 0, 40, 77, 0, 14, 0, 0, 71, 0, 73, 0, 36, 0, 0, 69, 0, 40, 0, 0, 32, 60]], 'g1pA': [[0, 0, 40, 40, 0, 0, 125, 40, 0, 0, 50, 125, 36, 0, 50, 30, 66, 0, 36, 0, 73, 13, 58, 73, 44], [43, 83, 86, 38, 118, 38, 38, 118, 0, 118, 118], [30, 0, 14, 40, 36, 0, 36, 0, 36], [32, 85, 85, 0, 85, 23], [0

In [11]:
vector_name = {}
rep = 0
for key, value in vector_dict.items():
    for student, vector in value.items():
        for v in vector:
            vector_name[tuple(v)] = rep
            rep+=1
        

        

In [12]:
# Creating answer vectors.

pro_dict = {}
for key, value in vector_dict.items():
    pro_dict[key] = {}
    for student, answer in value.items():
        pro_dict[key][student] = []
        for v in answer:
            ans = tuple(v)
            pro_dict[key][student].append(vector_name[ans])

for key, value in pro_dict.items():
    print(key, '    ', value)
    break
    
print(len(pro_dict.keys()))

taskc      {'g0pC': [0, 1, 2, 3, 4, 5, 6, 7, 8], 'g0pE': [9], 'g1pA': [10, 11, 12, 76, 14, 15, 16, 17, 18, 19, 20], 'g3pB': [21, 22, 23, 24, 25, 26, 27, 28, 29], 'g0pB': [30, 74, 153, 108, 137, 78, 133, 161, 82, 39, 40, 138, 42], 'g2pA': [43, 153, 45, 46, 47, 48, 49, 50, 51, 133, 53, 82, 55, 56, 57, 86, 906, 60], 'g0pD': [61, 62, 63, 64, 65, 66, 67, 68, 137, 70, 71], 'orig': [72, 153, 74, 155, 76, 137, 78, 79, 133, 161, 82, 83, 84, 85, 86, 906, 118], 'g3pC': [89, 90, 91, 92, 93, 94, 95], 'g2pB': [96, 97, 98, 99, 100, 101, 102, 103, 104, 105], 'g0pA': [106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118], 'g4pC': [119, 120, 906, 122, 906, 124, 125, 906, 127, 906, 129, 906, 131, 132], 'g4pE': [133, 161, 135, 136, 137, 138, 139, 140, 141, 142, 143], 'g2pC': [144, 145, 146, 147, 148, 149, 150, 151], 'g4pB': [152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163], 'g1pD': [164, 165, 166, 167], 'g4pD': [168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 18

In [13]:
import pyfpgrowth
inp = ["taske", "taska", "taskb", "taskc", "taskd"]

In [14]:
# performing FP_Growth Algorithm

def fp_growth(transactions):
    patterns = pyfpgrowth.find_frequent_patterns(transactions, 3)
    frequent_list = []
    for p in patterns:
        if len(p) < 3:
            continue
        else:
            frequent_list.append(list(p))
    return frequent_list


In [15]:
def is_sub(sub, lst):
    ln = len(sub)
    for i in range(len(lst) - ln + 1):
        if all(sub[j] == lst[i+j] for j in range(ln)):
            return True
    return False

In [16]:
def output(frequent_list):
    output_dict= {}
    for lst in frequent_list:
        for k, v in pro_dict.items():
            for name, vec in v.items():
                if is_sub(lst,vec):
                    if k not in output_dict:
                        output_dict[k] = [name]
                    else:
                        output_dict[k].append(name)
    return output_dict

  

In [17]:
# Final output from our script.

final= {}
for task in inp:
    transactions = []
    for k, v in pro_dict.items():
        if k == task:
            for name, vec in v.items():
                transactions.append(vec)
            frequent_list = fp_growth(transactions)
    output_dict = output(frequent_list)
    for key, value in output_dict.items():
        final[key] = set(value)
        

for i, v in final.items():
    print(i, '    ', v)


taske      {'g1pB', 'orig', 'g2pB', 'g0pE'}
taska      {'g0pE', 'orig', 'g4pC', 'g2pC', 'g3pC'}
taskd      {'g2pA', 'g4pC', 'g3pA'}


In [18]:
import pandas as pd
xl = pd.ExcelFile("corpus_final.xls")

In [19]:
df = xl.parse("File list")


In [20]:
c = df[['File','Category']]



In [21]:
# Evaluation

tp = 0
tn = 0
fn = 0
fp = 0
for key, value in final.items():
    for v in value:
        ans = v+key+'.txt'
        head = key
        for index, row in c.iterrows():
            if row['File'].split('_')[1].split('.')[0] == head:
                if row['File'].split('_')[0] == v:
                    if row['Category'] == 'cut' or row['Category'] == 'heavy':
                        tp+=1
                    else:
                        fp+=1
                else:
                    if row['Category'] == 'cut' or row['Category'] == 'heavy':
                        fn +=1
                    else:
                        tn+=1

precision = float(tp/(tp+fp))                        
print('Precision is : ', precision)
recall = float(tp/(tp+fn))
print('Accuracy is : ', (tp+tn)/(tp+fn+tn+fp))



                

Precision is :  0.9
Accuracy is :  0.6359649122807017
