In [1]:
from __future__ import division
from sklearn.cluster import KMeans
from numbers import Number
from pandas import DataFrame
import sys, codecs, numpy

In [2]:
class autovivify_list(dict):
  '''A pickleable version of collections.defaultdict'''
  def __missing__(self, key):
    '''Given a missing key, set initial value to an empty list'''
    value = self[key] = []
    return value

  def __add__(self, x):
    '''Override addition for numeric types when self is empty'''
    if not self and isinstance(x, Number):
      return x
    raise ValueError

  def __sub__(self, x):
    '''Also provide subtraction method'''
    if not self and isinstance(x, Number):
      return -1 * x
    raise ValueError

In [3]:
def build_word_vector_matrix(vector_file, n_words):
  '''Return the vectors and labels for the first n_words in vector file'''
  numpy_arrays = []
  labels_array = []
  with codecs.open(vector_file, 'r', 'utf-8') as f:
    for c, r in enumerate(f):
      sr = r.split()
      labels_array.append(sr[0])
      numpy_arrays.append( numpy.array([float(i) for i in sr[1:]]) )

      if c == n_words:
        return numpy.array( numpy_arrays ), labels_array

  return numpy.array( numpy_arrays ), labels_array

In [4]:
def find_word_clusters(labels_array, cluster_labels):
  '''Return the set of words in each cluster'''
  cluster_to_words = autovivify_list()
  for c, i in enumerate(cluster_labels):
    cluster_to_words[ i ].append( labels_array[c] )
  return cluster_to_words

In [5]:
input_vector_file = '/Users/varunnandu/glove/vectors.txt' # Vector file input (e.g. glove.6B.300d.txt)
n_words = int(1262) # Number of words to analyze 
reduction_factor = float(0.2) # Amount of dimension reduction {0,1}
n_clusters = int( n_words * reduction_factor ) # Number of clusters to make
df, labels_array = build_word_vector_matrix(input_vector_file, n_words)
kmeans_model = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
kmeans_model.fit(df)


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=252, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [6]:
cluster_labels  = kmeans_model.labels_
cluster_inertia   = kmeans_model.inertia_
cluster_to_words  = find_word_clusters(labels_array, cluster_labels)

clusters = []
for c in cluster_to_words:
    clusters.append(cluster_to_words[c])

In [7]:
clust_dict = {}
for clus in clusters:
    if len(clus) == 1:
        if 0 not in clust_dict:
            clust_dict[0] = clus
        else:
            clust_dict[0] += clus
    else:
        clust_dict[clusters.index(clus)] = clus
for key, value in clust_dict.items():
    print(key, '    ', value)

0      ['the', 'of', 'is', 'to', 'and', '05', 'in', 'that', 'be', 'as', 'it', 'by', 'are', 'for', 'can', 'this', 'or', 'probability', 'used', 'which', 'an', 'vector', 'pagerank', 'on', 'from', 'classes', 'he', 'has', 'with', 'have', 'page', 'one', 'inheritance', 'each', 'probabilities', 'google', 'also', 'class', 'subproblems', 'number', 'links', 'given', 'we', 'these', 'between', 'more', 'new', 'where', 'if', 'using', 'any', 'methods', 'means', 'at', 'two', 'computer', 'objects', 'been', 'values', 'no', 'there', 'represented', 'same', 'they', 'substructure', 'another', 'derived', 'often', 'based', 'optimization', 'mathematical', 'after', 'models']
18      ['programming', 'dynamic']
24      ['term', 'occurs']
28      ['h0k', 'ubx', 'hhk', 'hij', 'hijses']
30      ['all', 'common']
32      ['theorem', 'bayes']
36      ['document', 'query']
38      ['06', '000464908', '000479967', '00152028']
40      ['problems', 'method', 'solving']
41      ['documents', 'vectors']
42      ['optimal', '

In [8]:
import os, sys, nltk, re

# Open a file
path = "/Users/varunnandu/Desktop/plagiarism-detection-nlp/DATA-NLP"
dirs = os.listdir( path )



In [9]:
question_mapping = dict() # Mapping to question dictionary
 # Mapping to student as key and answer as value

for file in dirs:
    if file == '.DS_Store':
            continue
    path_to_file='/Users/varunnandu/Desktop/plagiarism-detection-nlp/DATA-NLP/'
    file_name = file
    split_name = file.split('_')
    student_name = split_name[0]
    question_number = split_name[1].split('.')[0]
    if question_number not in question_mapping:
        question_mapping[question_number] = {}
    path_to_file += file_name
    with open(path_to_file, 'r', errors = 'ignore') as f:
        mylist = f.read()
        sent_tokenize_list = nltk.sent_tokenize(mylist)
        temp = []
        for i in sent_tokenize_list:
            sent = i.lower()
            sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
            temp.append(sent)
        question_mapping[question_number][student_name] = temp
        

In [10]:
vector_dict = {}
for key, value in question_mapping.items():
    vector_dict[key] = {}
    for student, answer in value.items():
        vector_dict[key][student] = []
        for sent in answer:
            words = sent.split(' ')
            temp_list = []
            for w in words:
                for cluster_key, cluster_value in clust_dict.items():
                    if w in clust_dict[cluster_key]:
                        temp_list.append( cluster_key)
            vector_dict[key][student].append(temp_list)
            
for k, v in vector_dict.items():
    print(k, '    ', v)
    break


taskc      {'g0pC': [[0, 0, 46, 46, 0, 0, 0, 36, 0, 0, 0, 52, 0, 0, 126, 0, 230], [0, 36, 0, 0, 0, 0, 73, 0, 236], [0, 0, 0, 90, 0, 0, 125, 0, 73, 236], [0, 0, 0, 134, 0, 0, 0, 0, 0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 131, 0, 0, 153, 229, 0, 157, 0, 41], [0, 77, 85, 0, 0, 0, 0, 41, 0, 120, 0, 0, 134], [0, 36, 0, 82, 46, 0, 36, 0, 0, 0, 0, 0, 0, 90, 131, 85, 0, 36, 0, 0, 0, 153, 139, 0, 157, 36], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 119, 0, 0, 134, 120, 0, 153, 139, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42, 131, 0, 109, 0, 90, 0, 0, 0], [0, 111]], 'g0pE': [[0, 117, 0, 118, 0, 41, 0, 41, 0, 30, 0, 46, 0, 81, 0, 0, 0, 46, 0, 46, 46, 0, 0, 0, 0, 44, 44, 107, 41, 0, 36, 36, 0, 36, 0, 86, 0, 245, 0, 46, 241, 0, 0, 0, 0, 232, 0, 98, 0, 41, 0, 0, 0, 0, 46, 0, 0, 36, 152]], 'g1pA': [[0, 0, 46, 46, 0, 0, 239, 46, 0, 0, 227, 239, 41, 0, 0, 0, 95, 0, 41, 0, 98, 0, 140, 248, 52], [66, 132, 242, 44, 240, 44, 44, 240, 0, 240, 223], [0, 0, 0, 46, 41, 0, 0, 0, 41], [0, 236, 236, 0, 236, 24], [0, 6

In [11]:
vector_name = {}
rep = 0
for key, value in vector_dict.items():
    for student, vector in value.items():
        for v in vector:
            vector_name[tuple(v)] = rep
            rep+=1
        

        

In [12]:
pro_dict = {}
for key, value in vector_dict.items():
    pro_dict[key] = {}
    for student, answer in value.items():
        pro_dict[key][student] = []
        for v in answer:
            ans = tuple(v)
            pro_dict[key][student].append(vector_name[ans])

for key, value in pro_dict.items():
    print(key, '    ', value)
    break
    
print(len(pro_dict.keys()))

taskc      {'g0pC': [0, 1, 2, 3, 4, 5, 6, 7, 8], 'g0pE': [9], 'g1pA': [10, 11, 12, 76, 14, 15, 16, 17, 18, 19, 20], 'g3pB': [21, 22, 23, 24, 25, 26, 27, 28, 29], 'g0pB': [30, 74, 153, 108, 137, 78, 133, 134, 82, 39, 40, 138, 42], 'g2pA': [43, 153, 45, 46, 47, 48, 49, 50, 51, 133, 53, 82, 55, 56, 57, 86, 909, 60], 'g0pD': [61, 62, 63, 64, 65, 66, 67, 68, 137, 70, 71], 'orig': [72, 153, 74, 155, 76, 137, 78, 79, 133, 134, 82, 83, 84, 85, 86, 909, 118], 'g3pC': [89, 90, 91, 92, 93, 94, 95], 'g2pB': [96, 97, 98, 99, 100, 101, 102, 103, 104, 105], 'g0pA': [106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118], 'g4pC': [119, 120, 909, 122, 909, 124, 750, 909, 127, 909, 129, 909, 131, 132], 'g4pE': [133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143], 'g2pC': [144, 145, 146, 147, 148, 149, 150, 151], 'g4pB': [152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163], 'g1pD': [164, 165, 166, 167], 'g4pD': [168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 18

In [40]:
import pyfpgrowth
transactions = []
inp = "taske"

In [41]:
for k, v in pro_dict.items():
    if k == inp:
        for name, vec in v.items():
            transactions.append(vec)
print(transactions)


[[407, 528, 622, 602, 603, 623], [413, 414, 415, 416, 417, 418, 419, 420, 421, 422], [423, 424, 622, 426, 603, 428, 632, 624], [431, 432, 433, 599, 559, 436, 437, 438, 439, 440], [441, 442, 443, 444, 445], [446, 528, 598, 599, 559, 600, 622, 602, 603, 623, 624, 457, 625, 626, 909, 628, 909, 630, 631, 909, 466, 467, 632, 633, 634, 635, 636, 637, 474, 638, 639], [477, 478, 479, 480, 481, 482], [483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 909, 494, 909, 496, 497], [498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513], [514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 909, 525, 526], [621, 528, 529, 530, 602, 603, 623, 624, 535, 536, 628, 630, 631], [540, 541, 542, 543, 544, 909, 546, 909, 548, 909, 550, 551, 552, 553, 554], [555, 556, 557, 599, 559, 560, 561, 562, 563, 624, 565, 566, 567], [568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 634, 579], [580, 581, 582, 583, 584, 585, 586, 587], [588, 589, 590, 591, 592, 593, 594, 595, 596], [597, 598, 599, 6

In [42]:
patterns = pyfpgrowth.find_frequent_patterns(transactions, 3)
frequent_list = []
for p in patterns:
    if len(p) < 3:
        continue
    else:
        frequent_list.append(list(p))
print(frequent_list[:5])

[[528, 602, 623], [528, 602, 603], [528, 602, 909], [528, 603, 623], [528, 623, 909]]


In [27]:
def is_sub(sub, lst):
    ln = len(sub)
    for i in range(len(lst) - ln + 1):
        if all(sub[j] == lst[i+j] for j in range(ln)):
            return True
    return False

In [37]:
output_dict= {}
for lst in frequent_list:
    for k, v in pro_dict.items():
        for name, vec in v.items():
            if is_sub(lst,vec):
                if k not in output_dict:
                    output_dict[k] = [name]
                else:
                    output_dict[k].append(name)

# print(output_dict)
for k, v in output_dict.items():
    print('Copied students for ', k, ' ', set(v))    

Copied students for  taske   {'g0pE', 'g2pB', 'orig', 'g1pB'}
Copied students for  taska   {'g0pE', 'orig', 'g2pC', 'g3pC', 'g4pC'}
Copied students for  taskd   {'orig', 'g2pA', 'g4pC', 'g3pA'}


In [29]:
import pandas as pd
xl = pd.ExcelFile("corpus_final.xls")

In [30]:
df = xl.parse("File list")


In [31]:
c = df[['File','Category']]



In [32]:
tp = 0
tn = 0
fn = 0
fp = 0
for key, value in output_dict.items():
    for v in value:
        ans = v+key+'.txt'
        head = key
        for index, row in c.iterrows():
            if row['File'].split('_')[1].split('.')[0] == head:
                if row['File'].split('_')[0] == v:
                    if row['Category'] == 'cut' or row['Category'] == 'heavy':
                        tp+=1
                    else:
                        fp+=1
                else:
                    if row['Category'] == 'cut' or row['Category'] == 'heavy':
                        fn +=1
                    else:
                        tn+=1

precision = float(tp/(tp+fp))                        
print('Precision is : ', precision)
recall = float(tp/(tp+fn))
print('Recall is : ', recall)
print('Accuracy is : ', (tp+tn)/(tp+fn+tn+fp))



                

Precision is :  0.7222222222222222
Recall is :  0.0693950177935943
Accuracy is :  0.6322624743677375
