In [1]:
from __future__ import division
from sklearn.cluster import KMeans
from numbers import Number
from pandas import DataFrame
import sys, codecs, numpy

In [2]:
class autovivify_list(dict):
  '''A pickleable version of collections.defaultdict'''
  def __missing__(self, key):
    '''Given a missing key, set initial value to an empty list'''
    value = self[key] = []
    return value

  def __add__(self, x):
    '''Override addition for numeric types when self is empty'''
    if not self and isinstance(x, Number):
      return x
    raise ValueError

  def __sub__(self, x):
    '''Also provide subtraction method'''
    if not self and isinstance(x, Number):
      return -1 * x
    raise ValueError

In [3]:
def build_word_vector_matrix(vector_file, n_words):
  '''Return the vectors and labels for the first n_words in vector file'''
  numpy_arrays = []
  labels_array = []
  with codecs.open(vector_file, 'r', 'utf-8') as f:
    for c, r in enumerate(f):
      sr = r.split()
      labels_array.append(sr[0])
      numpy_arrays.append( numpy.array([float(i) for i in sr[1:]]) )

      if c == n_words:
        return numpy.array( numpy_arrays ), labels_array

  return numpy.array( numpy_arrays ), labels_array

In [4]:
def find_word_clusters(labels_array, cluster_labels):
  '''Return the set of words in each cluster'''
  cluster_to_words = autovivify_list()
  for c, i in enumerate(cluster_labels):
    cluster_to_words[ i ].append( labels_array[c] )
  return cluster_to_words

In [7]:
input_vector_file = '/Users/varunnandu/glove/vectors.txt' # Vector file input (e.g. glove.6B.300d.txt)
n_words = int(1262) # Number of words to analyze 
reduction_factor = float(0.2) # Amount of dimension reduction {0,1}
n_clusters = int( n_words * reduction_factor ) # Number of clusters to make
df, labels_array = build_word_vector_matrix(input_vector_file, n_words)
kmeans_model = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
kmeans_model.fit(df)


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=252, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [11]:
cluster_labels  = kmeans_model.labels_
cluster_inertia   = kmeans_model.inertia_
cluster_to_words  = find_word_clusters(labels_array, cluster_labels)

clusters = []
for c in cluster_to_words:
    clusters.append(cluster_to_words[c])

In [20]:
clust_dict = {}
for clus in clusters:
    if len(clus) == 1:
        if 0 not in clust_dict:
            clust_dict[0] = clus
        else:
            clust_dict[0] += clus
    else:
        clust_dict[clusters.index(clus)] = clus
for key, value in clust_dict.items():
    print(key, '    ', value)

0      ['the', 'of', 'is', 'to', 'and', '05', 'in', 'that', 'be', 'as', 'it', 'by', 'are', 'for', 'can', 'this', 'or', 'probability', 'programming', 'used', 'an', 'vector', 'pagerank', 'on', 'from', 'classes', 'he', 'has', 'with', 'have', 'document', 'one', '06', 'inheritance', 'problems', 'optimal', 'each', 'probabilities', 'google', 'also', 'class', 'subproblems', 'number', 'terms', 'links', 'given', 'we', 'words', 'example', 'more', 'new', 'value', 'where', 'if', 'using', 'any', 'its', 'methods', 'solutions', 'dynamic', 'many', 'means', 'at', 'known', 'computer', 'objects', 'been', 'different', 'such', 'there', 'represented', 'same', 'marginal', 'because', 'program', 'substructure', 'derived', 'often', 'ways', 'based', 'certain', 'could', 'after', 'science']
20      ['which', 'way', 'form']
24      ['term', 'occurs']
28      ['h0k', 'ubx', 'hhk', 'hij', 'hijses']
30      ['all', 'common']
32      ['theorem', 'bayes']
35      ['page', 'other', 'pages', 'web']
41      ['documents', 'v

In [121]:
import os, sys, nltk, re

# Open a file
path = "/Users/varunnandu/Desktop/plagiarism-detection-nlp/DATA-NLP"
dirs = os.listdir( path )



In [133]:
question_mapping = dict() # Mapping to question dictionary
 # Mapping to student as key and answer as value

for file in dirs:
    if file == '.DS_Store':
            continue
    path_to_file='/Users/varunnandu/Desktop/plagiarism-detection-nlp/DATA-NLP/'
    file_name = file
    split_name = file.split('_')
    student_name = split_name[0]
    question_number = split_name[1].split('.')[0]
    if question_number not in question_mapping:
        question_mapping[question_number] = {}
    path_to_file += file_name
    with open(path_to_file, 'r', errors = 'ignore') as f:
        mylist = f.read()
        sent_tokenize_list = nltk.sent_tokenize(mylist)
        temp = []
        for i in sent_tokenize_list:
            sent = i.lower()
            sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
            temp.append(sent)
        question_mapping[question_number][student_name] = temp
        

In [189]:
vector_dict = {}
for key, value in question_mapping.items():
    vector_dict[key] = {}
    for student, answer in value.items():
        vector_dict[key][student] = []
        for sent in answer:
            words = sent.split(' ')
            temp_list = []
            for w in words:
                for cluster_key, cluster_value in clust_dict.items():
                    if w in clust_dict[cluster_key]:
                        temp_list.append( cluster_key)
            vector_dict[key][student].append(temp_list)


In [144]:
vector_name = {}
rep = 0
for key, value in vector_dict.items():
    for student, vector in value.items():
        for v in vector:
            vector_name[tuple(v)] = rep
            rep+=1
        

        

In [154]:
pro_dict = {}
# for key, value in vector_dict.items():
#     for student, vector in value.items():
#         for v in vector:
#             ans = tuple(v)
#             pro_dict.setdefault(student, []).append(vector_name[ans])
for key, value in vector_dict.items():
    pro_dict[key] = {}
    for student, answer in value.items():
        pro_dict[key][student] = []
        for v in answer:
            ans = tuple(v)
            pro_dict[key][student].append(vector_name[ans])

for key, value in pro_dict.items():
    print(key, '    ', value)
    
print(len(pro_dict.keys()))

taskc      {'g0pC': [0, 1, 2, 3, 4, 5, 6, 7, 8], 'g0pE': [9], 'g1pA': [10, 11, 12, 76, 14, 15, 16, 17, 18, 19, 20], 'g3pB': [21, 22, 23, 24, 25, 26, 27, 28, 29], 'g0pB': [30, 74, 153, 108, 137, 78, 133, 134, 82, 39, 40, 138, 42], 'g2pA': [43, 153, 45, 772, 47, 48, 49, 50, 51, 133, 53, 82, 55, 56, 57, 86, 909, 60], 'g0pD': [61, 62, 63, 64, 65, 66, 67, 68, 137, 70, 71], 'orig': [72, 153, 74, 155, 76, 137, 78, 79, 133, 134, 82, 83, 84, 85, 86, 909, 118], 'g3pC': [89, 90, 91, 92, 93, 94, 95], 'g2pB': [96, 97, 98, 99, 100, 101, 102, 103, 104, 105], 'g0pA': [106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118], 'g4pC': [119, 120, 909, 122, 909, 124, 750, 909, 127, 909, 129, 909, 131, 132], 'g4pE': [133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143], 'g2pC': [144, 145, 146, 147, 148, 149, 150, 151], 'g4pB': [152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163], 'g1pD': [164, 165, 166, 167], 'g4pD': [168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 1

In [176]:
import pyfpgrowth
transactions = []
inp = "taskd"

In [177]:
for k, v in pro_dict.items():
    if k == inp:
        for name, vec in v.items():
            transactions.append(vec)
print(transactions)
# 3 is the support
# for item in find_frequent_itemsets(transactions, 3):
#     if len(item) < 3:
#         continue
#     frequent_list.append(item)

# print frequent_list

[[218, 219, 220], [221, 222, 223, 224, 225, 226], [227, 228, 229, 372, 231, 232, 233], [367, 235, 236, 237, 370, 239, 240, 241], [242, 243], [244, 245, 246, 247, 248], [367, 392, 333, 369, 253, 371, 372, 373, 374, 375, 316, 386, 318, 386, 320, 378], [367, 392, 333, 369, 370, 371, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284], [360, 286, 287, 288, 386, 290, 291, 292, 293, 294, 295, 296], [297, 298, 909, 300, 301, 302, 303, 304, 305], [391, 307, 308, 369, 370, 371, 373, 313, 314, 315, 316, 386, 318, 386, 320, 378], [322, 323, 324, 325, 326, 327, 328, 329, 330], [331, 392, 333, 334, 335, 336, 337, 338, 373, 340, 341, 342, 343], [344, 345, 346, 347, 348, 354], [350, 351, 352, 353, 354, 355, 356, 357, 358, 359], [360, 361, 362, 363, 364, 365, 366], [367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378], [379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390], [391, 392, 393, 394, 395, 396, 397, 398, 399], [400, 401, 402, 403, 404, 405, 406]]


In [181]:
patterns = pyfpgrowth.find_frequent_patterns(transactions, 3)
frequent_list = []
for p in patterns:
    if len(p) < 3:
        continue
    else:
        frequent_list.append(list(p))
print(frequent_list)

[[373, 378, 386], [371, 373, 378], [371, 373, 378, 386], [371, 378, 386], [369, 371, 378], [369, 371, 378, 386], [369, 371, 373, 378], [369, 371, 373, 378, 386], [369, 373, 378, 386], [369, 378, 386], [367, 369, 371], [369, 370, 371], [369, 371, 386], [371, 373, 386], [369, 371, 373], [369, 371, 373, 386], [369, 373, 386]]


In [186]:
def is_sub(sub, lst):
    ln = len(sub)
    for i in range(len(lst) - ln + 1):
        if all(sub[j] == lst[i+j] for j in range(ln)):
            return True
    return False

In [187]:
output_dict= {}
for lst in frequent_list:
    for k, v in pro_dict.items():
        for name, vec in v.items():
            if is_sub(lst,vec):
                if k not in output_dict:
                    output_dict[k] = [name]
                else:
                    output_dict[k].append(name)

print(output_dict)
for k, v in output_dict.items():
    for item in v:
        print('Question ' + k + ' was copied from ' + item)

{'taskd': ['g2pA', 'g4pC', 'g3pA']}
Question taskd was copied from g2pA
Question taskd was copied from g4pC
Question taskd was copied from g3pA
