In [1]:
from __future__ import division
from sklearn.cluster import KMeans
from numbers import Number
from pandas import DataFrame
import sys, codecs, numpy

In [2]:
class autovivify_list(dict):
  def __missing__(self, key):
    '''Given a missing key, set initial value to an empty list'''
    value = self[key] = []
    return value

  def __add__(self, x):
    '''Override addition for numeric types when self is empty'''
    if not self and isinstance(x, Number):
      return x
    raise ValueError

  def __sub__(self, x):
    '''Also provide subtraction method'''
    if not self and isinstance(x, Number):
      return -1 * x
    raise ValueError

In [3]:
def build_word_vector_matrix(vector_file, n_words):
  '''Return the vectors and labels for the first n_words in vector file'''
  numpy_arrays = []
  labels_array = []
  with codecs.open(vector_file, 'r', 'utf-8') as f:
    for c, r in enumerate(f):
      sr = r.split()
      labels_array.append(sr[0])
      numpy_arrays.append( numpy.array([float(i) for i in sr[1:]]) )

      if c == n_words:
        return numpy.array( numpy_arrays ), labels_array

  return numpy.array( numpy_arrays ), labels_array

In [4]:
def find_word_clusters(labels_array, cluster_labels):
  '''Return the set of words in each cluster'''
  cluster_to_words = autovivify_list()
  for c, i in enumerate(cluster_labels):
    cluster_to_words[ i ].append( labels_array[c] )
  return cluster_to_words

In [5]:
input_vector_file = '/Users/varunnandu/glove/vectors.txt' 
n_words = int(1262) # Number of words to analyze 
reduction_factor = float(0.2) # Amount of dimension reduction {0,1}
n_clusters = int( n_words * reduction_factor ) # Number of clusters to make
df, labels_array = build_word_vector_matrix(input_vector_file, n_words)
kmeans_model = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
kmeans_model.fit(df)


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=252, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [6]:
cluster_labels  = kmeans_model.labels_
cluster_inertia   = kmeans_model.inertia_
cluster_to_words  = find_word_clusters(labels_array, cluster_labels)

clusters = []
for c in cluster_to_words:
    clusters.append(cluster_to_words[c])

In [7]:
clust_dict = {}
for clus in clusters:
    if len(clus) == 1:
        if 0 not in clust_dict:
            clust_dict[0] = clus
        else:
            clust_dict[0] += clus
    else:
        clust_dict[clusters.index(clus)] = clus
for key, value in clust_dict.items():
    print(key, '    ', value)

0      ['the', 'of', 'is', 'to', 'and', '05', 'in', 'that', 'be', 'as', 'it', 'by', 'are', 'for', 'can', 'this', 'or', 'used', 'which', 'an', 'vector', 'pagerank', 'term', 'on', 'from', 'classes', 'has', 'with', 'have', 'one', '06', 'inheritance', 'problems', 'optimal', 'each', 'probabilities', 'google', 'also', 'class', 'number', 'links', 'given', 'we', 'these', 'example', 'more', 'new', 'where', 'if', 'any', 'methods', 'solutions', 'sub', 'into', 'means', 'at', 'known', 'computer', 'objects', 'been', 'values', 'will', 'no', 'there', 'same', 'they', 'program', 'another', 'derived', 'order', 'often', 'based', 'could', 'mathematical', 'after', 'science', 'base', 'algebraic', 'sense']
17      ['probability', 'theory', 'prior']
18      ['programming', 'dynamic']
28      ['h0k', 'he', 'ubx', 'hhk', 'hij', 'hijses']
29      ['all', 'common']
31      ['theorem', 'bayes']
34      ['page', 'other', 'pages', 'web']
35      ['document', 'query']
40      ['documents', 'vectors']
43      ['informa

In [8]:
import os, sys, nltk, re

# Open a file
path = "/Users/varunnandu/Desktop/plagiarism-detection-nlp/DATA-NLP"
dirs = os.listdir( path )



In [9]:
question_mapping = dict() # Mapping to question dictionary
 # Mapping to student as key and answer as value

for file in dirs:
    if file == '.DS_Store':
            continue
    path_to_file='/Users/varunnandu/Desktop/plagiarism-detection-nlp/DATA-NLP/'
    file_name = file
    split_name = file.split('_')
    student_name = split_name[0]
    question_number = split_name[1].split('.')[0]
    if question_number not in question_mapping:
        question_mapping[question_number] = {}
    path_to_file += file_name
    with open(path_to_file, 'r', errors = 'ignore') as f:
        mylist = f.read()
        sent_tokenize_list = nltk.sent_tokenize(mylist)
        temp = []
        for i in sent_tokenize_list:
            sent = i.lower()
            sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
            temp.append(sent)
        question_mapping[question_number][student_name] = temp
        

In [10]:
vector_dict = {}
for key, value in question_mapping.items():
    vector_dict[key] = {}
    for student, answer in value.items():
        vector_dict[key][student] = []
        for sent in answer:
            words = sent.split(' ')
            temp_list = []
            for w in words:
                for cluster_key, cluster_value in clust_dict.items():
                    if w in clust_dict[cluster_key]:
                        temp_list.append( cluster_key)
            vector_dict[key][student].append(temp_list)
            
for k, v in vector_dict.items():
    print(k, '    ', v)
    break


taskc      {'g0pC': [[0, 0, 45, 45, 0, 0, 0, 35, 0, 0, 0, 51, 0, 0, 0, 0, 83], [0, 35, 0, 0, 0, 0, 102, 0, 237], [0, 0, 0, 68, 0, 0, 124, 0, 102, 237], [0, 0, 0, 133, 0, 0, 0, 0, 0, 35, 0, 0, 0, 0, 0, 0, 0, 0, 58, 0, 0, 61, 228, 61, 152, 70, 40], [0, 0, 58, 0, 61, 0, 70, 40, 0, 120, 0, 0, 133], [0, 35, 0, 83, 45, 0, 35, 0, 0, 0, 90, 0, 0, 68, 58, 58, 0, 35, 0, 0, 0, 61, 138, 0, 152, 35], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 96, 0, 0, 133, 120, 0, 61, 138, 61, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 58, 0, 110, 0, 68, 0, 0, 0], [0, 245]], 'g0pE': [[0, 244, 0, 119, 0, 40, 0, 40, 0, 29, 0, 45, 0, 0, 0, 0, 0, 45, 0, 45, 45, 0, 0, 0, 0, 43, 43, 114, 40, 0, 35, 35, 0, 35, 0, 86, 0, 244, 0, 45, 243, 0, 0, 0, 0, 251, 0, 248, 0, 40, 0, 0, 0, 0, 45, 0, 0, 35, 118]], 'g1pA': [[0, 0, 45, 45, 0, 0, 0, 45, 0, 0, 224, 238, 40, 0, 0, 0, 87, 0, 40, 0, 99, 0, 139, 248, 51], [65, 131, 152, 43, 241, 43, 43, 241, 0, 241, 241], [0, 0, 0, 45, 40, 0, 90, 0, 40], [0, 237, 237, 0, 237, 0], [0, 65, 0,

In [11]:
vector_name = {}
rep = 0
for key, value in vector_dict.items():
    for student, vector in value.items():
        for v in vector:
            vector_name[tuple(v)] = rep
            rep+=1
        

        

In [12]:
pro_dict = {}
for key, value in vector_dict.items():
    pro_dict[key] = {}
    for student, answer in value.items():
        pro_dict[key][student] = []
        for v in answer:
            ans = tuple(v)
            pro_dict[key][student].append(vector_name[ans])

for key, value in pro_dict.items():
    print(key, '    ', value)
    break
    
print(len(pro_dict.keys()))

taskc      {'g0pC': [0, 1, 2, 3, 4, 5, 6, 7, 8], 'g0pE': [9], 'g1pA': [10, 11, 12, 76, 14, 15, 16, 17, 18, 19, 20], 'g3pB': [21, 22, 23, 24, 25, 26, 27, 28, 29], 'g0pB': [30, 74, 153, 108, 137, 78, 133, 134, 82, 39, 40, 138, 42], 'g2pA': [43, 153, 45, 46, 47, 48, 49, 50, 51, 133, 53, 82, 55, 56, 57, 86, 909, 60], 'g0pD': [61, 62, 63, 64, 65, 66, 67, 68, 137, 70, 71], 'orig': [72, 153, 74, 155, 76, 137, 78, 79, 133, 134, 82, 83, 84, 85, 86, 909, 118], 'g3pC': [89, 90, 91, 92, 93, 94, 95], 'g2pB': [96, 97, 98, 99, 100, 101, 102, 103, 104, 105], 'g0pA': [106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118], 'g4pC': [119, 120, 909, 122, 909, 124, 750, 909, 127, 909, 129, 909, 131, 132], 'g4pE': [133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143], 'g2pC': [144, 145, 146, 147, 148, 149, 150, 151], 'g4pB': [152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163], 'g1pD': [164, 165, 166, 167], 'g4pD': [168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 18

In [13]:
import pyfpgrowth
inp = ["taske", "taska", "taskb", "taskc", "taskd"]

In [14]:
def fp_growth(transactions):
    patterns = pyfpgrowth.find_frequent_patterns(transactions, 3)
    frequent_list = []
    for p in patterns:
        if len(p) < 3:
            continue
        else:
            frequent_list.append(list(p))
    return frequent_list


In [15]:
def is_sub(sub, lst):
    ln = len(sub)
    for i in range(len(lst) - ln + 1):
        if all(sub[j] == lst[i+j] for j in range(ln)):
            return True
    return False

In [16]:
def output(frequent_list):
    output_dict= {}
    for lst in frequent_list:
        for k, v in pro_dict.items():
            for name, vec in v.items():
                if is_sub(lst,vec):
                    if k not in output_dict:
                        output_dict[k] = [name]
                    else:
                        output_dict[k].append(name)
    return output_dict

  

In [23]:
final= {}
for task in inp:
    transactions = []
    for k, v in pro_dict.items():
        if k == task:
            for name, vec in v.items():
                transactions.append(vec)
            frequent_list = fp_growth(transactions)
    output_dict = output(frequent_list)
    for key, value in output_dict.items():
        final[key] = set(value)
        
# print(transactions)

print(final)



{'taske': {'orig', 'g0pE', 'g2pB', 'g1pB'}, 'taska': {'g0pE', 'g3pC', 'g2pC', 'g4pC', 'orig'}, 'taskd': {'g2pA', 'g4pC', 'g3pA'}}


In [18]:
import pandas as pd
xl = pd.ExcelFile("corpus_final.xls")

In [19]:
df = xl.parse("File list")


In [20]:
c = df[['File','Category']]



In [30]:
tp = 0
tn = 0
fn = 0
fp = 0
for key, value in final.items():
    for v in value:
        ans = v+key+'.txt'
        head = key
        for index, row in c.iterrows():
            if row['File'].split('_')[1].split('.')[0] == head:
                if row['File'].split('_')[0] == v:
                    if row['Category'] == 'cut' or row['Category'] == 'heavy':
                        tp+=1
                    else:
                        fp+=1
                else:
                    if row['Category'] == 'cut' or row['Category'] == 'heavy':
                        fn +=1
                    else:
                        tn+=1

precision = float(tp/(tp+fp))                        
print('Precision is : ', precision)
recall = float(tp/(tp+fn))
print('Recall is : ', recall)
print('Accuracy is : ', (tp+tn)/(tp+fn+tn+fp))
print('F1 Score is : ', 2*precision*recall/(precision+recall))



                

Precision is :  0.9
Recall is :  0.5934065934065934
Accuracy is :  0.6359649122807017
F1 Score is :  0.7152317880794702
