In [1]:
# import libraries
import pandas as pd
import numpy as np
import io

In [2]:
from google.colab import files
# Open the Colab upload widget; the returned object is indexed by file name below.
uploaded = files.upload()

# Parse the uploaded 'file_information.csv' from an in-memory buffer.
# The upload must use exactly this file name, otherwise the lookup raises KeyError.
plagiarism_df = pd.read_csv(io.BytesIO(uploaded['file_information.csv']))

# Preview the first five rows.
plagiarism_df.head(5)

Saving file_information.csv to file_information (2).csv


Unnamed: 0,File,Task,Category
0,g0pA_taska.txt,a,non
1,g0pA_taskb.txt,b,cut
2,g0pA_taskc.txt,c,light
3,g0pA_taskd.txt,d,heavy
4,g0pA_taske.txt,e,non


In [3]:
# Data description: row count, number of distinct tasks, and the category labels present
print('Number of files: ', len(plagiarism_df))

print('Number of unique tasks/question types (A-E): ', plagiarism_df['Task'].unique().size)
print('Unique plagiarism categories: ', plagiarism_df['Category'].unique())

Number of files:  100
Number of unique tasks/question types (A-E):  5
Unique plagiarism categories:  ['non' 'cut' 'light' 'heavy' 'orig']


In [4]:
# Count files per task, per plagiarism level, and per (task, level) pair

# files per task
counts_per_task = (
    plagiarism_df
    .groupby(['Task'])
    .size()
    .reset_index(name="Counts")
)
print("\nTask:")
display(counts_per_task)

# files per plagiarism level
counts_per_category = (
    plagiarism_df
    .groupby(['Category'])
    .size()
    .reset_index(name="Counts")
)
print("\nPlagiarism Levels:")
display(counts_per_category)

# files per (task, plagiarism level) combination; only the first rows are shown
counts_task_and_plagiarism = (
    plagiarism_df
    .groupby(['Task', 'Category'])
    .size()
    .reset_index(name="Counts")
)
print("\nTask & Plagiarism Level Combos :")
display(counts_task_and_plagiarism.head())


Task:


Unnamed: 0,Task,Counts
0,a,20
1,b,20
2,c,20
3,d,20
4,e,20



Plagiarism Levels:


Unnamed: 0,Category,Counts
0,cut,19
1,heavy,19
2,light,19
3,non,38
4,orig,5



Task & Plagiarism Level Combos :


Unnamed: 0,Task,Category,Counts
0,a,cut,4
1,a,heavy,3
2,a,light,3
3,a,non,9
4,a,orig,1


In [5]:
# Data encoding
def numerical_dataframe(plagiarism_df):
    '''Encode the textual plagiarism categories numerically and add a class label.

       The input dataframe is NOT modified; a new dataframe is returned, so the
       function can be called repeatedly on the same raw input.

       :param plagiarism_df: A dataframe with a `Category` column holding the labels
           'non', 'heavy', 'light', 'cut' or 'orig'
       :return: A copy of the dataframe in which `Category` is replaced by its integer
           code (non=0, heavy=1, light=2, cut=3, orig=-1) and an integer `Class` column
           is added (0 for 'non', -1 for 'orig', 1 for every plagiarised category)
       :raises KeyError: if `Category` contains a label outside the mapping'''

    # a mapping from categorical labels to numeric labels
    CATEGORY_TO_NUM = {"non":0,"heavy":1,"light":2,"cut":3,"orig":-1}
    # class labels that differ from the default of 1 (= plagiarised)
    CATEGORY_TO_CLASS = {"non": 0, "orig": -1}

    # work on a copy so the caller's dataframe keeps its original labels
    df = plagiarism_df.copy()
    labels = plagiarism_df['Category']

    # unmapped labels become NaN; report them instead of silently encoding NaN
    codes = labels.map(CATEGORY_TO_NUM)
    unknown = labels[codes.isna()].unique().tolist()
    if unknown:
        raise KeyError("Unknown plagiarism categories: {}".format(unknown))

    df['Category'] = codes.astype(int)
    df['Class'] = [CATEGORY_TO_CLASS.get(label, 1) for label in labels]
    df['Class'] = df['Class'].astype(int)
    return df

In [6]:
# Encode the `Category` labels as numbers and add the `Class` column
transformed_df = numerical_dataframe(plagiarism_df)

# check that all categories of plagiarism have a class label = 1
transformed_df.head(5)

Unnamed: 0,File,Task,Category,Class
0,g0pA_taska.txt,a,0,0
1,g0pA_taskb.txt,b,3,1
2,g0pA_taskc.txt,c,2,1
3,g0pA_taskd.txt,d,1,1
4,g0pA_taske.txt,e,0,0


In [7]:
from google.colab import drive
# Mount Google Drive under /content/gdrive; the corpus path used later points inside this mount.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [8]:
# Directory that holds the corpus text files; change it to wherever your corpus folder is.
# Keep the trailing '/': file names are appended to this prefix to build each file path.

path = "/content/gdrive/MyDrive/corpus-20090418/"


In [9]:
#Functions
import re
import operator 

# Add a 'Datatype' column to the dataframe that marks each record as an original wiki answer (0),
# training data (1) or test data (2). Test records are chosen by seeded, stratified random
# sampling over task and plagiarism category.

# Helper that labels a subset of rows as training (train_value) or test (test_value)
def create_datatype(df, train_value, test_value, datatype_var, compare_dfcolumn, operator_of_compare, value_of_compare,
                    sampling_number, sampling_seed):
    '''Label a subset of rows in `df` as training or test data, modifying `df` in place.

       The subset is every row for which
       `operator_of_compare(df[compare_dfcolumn], value_of_compare)` is true. Within the
       subset, rows are grouped by ('Task', compare_dfcolumn); from each group up to
       `sampling_number` rows are drawn at random and labelled `test_value`, and the
       remaining rows of the subset are labelled `train_value`. Rows outside the subset
       are left untouched.

       :param df: dataframe with a 'Task' column, the `compare_dfcolumn` column and
           (expected to exist already) the `datatype_var` column
       :param train_value: value written to `datatype_var` for training rows
       :param test_value: value written to `datatype_var` for sampled test rows
       :param datatype_var: name of the column that receives the labels
       :param compare_dfcolumn: name of the column used for subsetting and stratifying
       :param operator_of_compare: binary callable (e.g. `operator.gt`) applied to the
           column and `value_of_compare`; must return a boolean mask
       :param value_of_compare: right-hand operand of the comparison
       :param sampling_number: maximum number of test rows drawn per group
       :param sampling_seed: `random_state` passed to every per-group `sample` call
       :return: None; `df` is altered in place'''
    # Rows selected by the statement 'compare_dfcolumn' 'operator_of_compare' 'value_of_compare'
    df_subset = df[operator_of_compare(df[compare_dfcolumn], value_of_compare)]

    # Stratified random sample: draw the test rows group by group. Iterating over the
    # groups (instead of groupby.apply) avoids relying on apply receiving the grouping
    # columns, and only the sampled index labels are needed.
    test_index = []
    for _, group in df_subset.groupby(['Task', compare_dfcolumn]):
        n_test = min(len(group), sampling_number)
        test_index.extend(group.sample(n_test, random_state=sampling_seed).index)

    # Label the whole subset as training data first ...
    df.loc[df_subset.index, datatype_var] = train_value
    # ... then overwrite the stratified sample with the test label
    if test_index:
        df.loc[test_index, datatype_var] = test_value

    # returns nothing because dataframe df already altered
    
def train_test_dataframe(clean_df, random_seed=100):
    '''Return a copy of `clean_df` with a `Datatype` column of 'orig', 'train' or 'test'.

       :param clean_df: dataframe with 'Task' and numeric 'Category' columns
       :param random_seed: seed forwarded to the stratified sampling
       :return: a new dataframe; `clean_df` itself is not modified'''

    labelled_df = clean_df.copy()

    # Start every record at 0; only rows no sampling pass touches (the original
    # wiki answers) keep this value
    labelled_df.loc[:, 'Datatype'] = 0

    # Two sampling passes, both labelling train as 1 and test as 2:
    #   Category > 0  (plagiarised answers)     -> 1 test row per (Task, Category) group
    #   Category == 0 (non-plagiarised answers) -> 2 test rows per (Task, Category) group
    for comparison, test_rows_per_group in ((operator.gt, 1), (operator.eq, 2)):
        create_datatype(labelled_df, 1, 2, 'Datatype', 'Category', comparison, 0,
                        test_rows_per_group, random_seed)

    # translate the numeric datatype codes into readable names
    code_to_name = {0:'orig', 1:'train', 2:'test'}
    labelled_df['Datatype'] = [code_to_name[code] for code in labelled_df['Datatype']]

    return labelled_df


# function for pre-processing text given a file
def process_file(file):
    '''Read a text file object and return its normalised text.

       The text is lower-cased and every run of characters other than a-z and 0-9
       (punctuation, tabs, newlines, repeated spaces, ...) is replaced by a single
       space, so words are always separated by exactly one space. Leading and
       trailing separators are kept as a single space, not stripped.

       :param file: an open, readable text file object
       :return: the normalised text as a string'''
    # put text in all lower case letters
    all_text = file.read().lower()

    # Collapse each run of non-alphanumeric characters into one space. Replacing
    # character by character and then removing fixed 2- and 3-space sequences
    # left double spaces behind in the processed text.
    all_text = re.sub(r"[^a-z0-9]+", " ", all_text)

    return all_text


def create_text_column(df, file_directory=path):
    ''':param df: A dataframe of file information including a column for `File`
       :param file_directory: the main directory where files are stored
       :return: A copy of `df` with an added `Text` column holding the processed
           text of each file; `df` itself is not modified '''
    # local import so this function does not depend on an `import os` elsewhere in the notebook
    import os

    # create copy to modify
    text_df = df.copy()

    # store processed text
    text = []

    # Read the files in row order. Iterating over the `File` column directly
    # (instead of feeding index labels to positional `iloc`) keeps this correct
    # for dataframes whose index is not 0..n-1.
    for filename in df['File']:
        # join handles a directory given with or without a trailing separator
        file_path = os.path.join(file_directory, filename)
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            # append processed text to list
            text.append(process_file(file))

    # add column to the copied dataframe
    text_df['Text'] = text

    return text_df

In [10]:
# Read every file listed in transformed_df and store its processed text in a new `Text` column
text_df = create_text_column(transformed_df)
text_df.head()

Unnamed: 0,File,Task,Category,Class,Text
0,g0pA_taska.txt,a,0,0,inheritance is a basic concept of object orien...
1,g0pA_taskb.txt,b,3,1,pagerank is a link analysis algorithm used by ...
2,g0pA_taskc.txt,c,2,1,the vector space model also called term vector...
3,g0pA_taskd.txt,d,1,1,bayes theorem was names after rev thomas bayes...
4,g0pA_taske.txt,e,0,0,dynamic programming is an algorithm design tec...


In [11]:
# Show the full processed text of the first file as a sanity check
print(text_df.iloc[0]['Text'])

inheritance is a basic concept of object oriented programming where the basic idea is to create new classes that add extra detail to existing classes this is done by allowing the new classes to reuse the methods and variables of the existing classes and new methods and classes are added to specialise the new class inheritance models the is kind of relationship between entities or objects  for example postgraduates and undergraduates are both kinds of student this kind of relationship can be visualised as a tree structure where student would be the more general root node and both postgraduate and undergraduate would be more specialised extensions of the student node or the child nodes  in this relationship student would be known as the superclass or parent class whereas  postgraduate would be known as the subclass or child class because the postgraduate class extends the student class  inheritance can occur on several layers where if visualised would display a larger tree structure for 

In [12]:
# Train/test split: adds a `Datatype` column ('orig', 'train' or 'test'), sampled with seed 1
complete_df = train_test_dataframe(text_df, random_seed=1)

# check results
complete_df.head(5)

Unnamed: 0,File,Task,Category,Class,Text,Datatype
0,g0pA_taska.txt,a,0,0,inheritance is a basic concept of object orien...,train
1,g0pA_taskb.txt,b,3,1,pagerank is a link analysis algorithm used by ...,test
2,g0pA_taskc.txt,c,2,1,the vector space model also called term vector...,train
3,g0pA_taskd.txt,d,1,1,bayes theorem was names after rev thomas bayes...,train
4,g0pA_taske.txt,e,0,0,dynamic programming is an algorithm design tec...,train


In [13]:
#Similarity feature used-> n-grams
from sklearn.feature_extraction.text import CountVectorizer

def containment(ngram_array):
    ''' Containment is a measure of text similarity. It is the intersection of the
       ngram counts of two texts, normalized by the ngram count of the answer text.
       :param ngram_array: an array of ngram counts with one row per text; row 0 is
           the answer text, the following row(s) the source text.
       :return: a normalized containment value as a float; 0.0 when the answer text
           has no ngrams at all (instead of dividing by zero).'''

    counts = np.asarray(ngram_array)

    # total number of ngrams in the answer text (the normalizer)
    answer_total = counts[0].sum()
    if answer_total == 0:
        # e.g. an answer shorter than n words: nothing can be contained
        return 0.0

    # per-ngram minimum across the texts = number of shared occurrences
    shared_total = counts.min(axis=0).sum()

    return float(shared_total / answer_total)

# Calculate the ngram containment for one answer file/source file pair in a df
def calculate_containment(df, n, answer_filename):
    '''Compute the n-gram containment of one answer text in its source text.

       The answer row is looked up by file name; the source text is the first row
       with the same `Task` whose `Datatype` is 'orig'. Word n-grams of size `n`
       are counted over both texts and the counts are passed to `containment`.
       :param df: A dataframe with columns,
           'File', 'Task', 'Category', 'Class', 'Text', and 'Datatype'
       :param n: An integer that defines the ngram size
       :param answer_filename: A filename for an answer text in the df, ex. 'g0pB_taskd.txt'
       :return: A single containment value that represents the similarity
           between an answer text and its source text.
    '''

    # first row matching the answer file gives both its text and its task
    answer_row = df.loc[df["File"] == answer_filename].iloc[0]
    answer_text = answer_row["Text"]
    answer_task = answer_row["Task"]

    # the original (source) text for the same task
    is_source = (df["Task"] == answer_task) & (df["Datatype"] == 'orig')
    source_text = df.loc[is_source, "Text"].iloc[0]

    # count word n-grams of exactly size n over the pair of texts
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(n,n))
    ngram_counts = vectorizer.fit_transform([answer_text, source_text])

    return containment(ngram_counts.toarray())
    

In [14]:
# n-gram size used for this spot check
n = 3

# index labels of the first few files
test_indices = range(5)

# collect the plagiarism level and the containment value of each file
category_vals, containment_vals = [], []
for i in test_indices:
    filename = complete_df.loc[i, 'File']
    # level of plagiarism for this file
    category_vals.append(complete_df.loc[i, 'Category'])
    # containment of this file in its source text for the chosen n
    c = calculate_containment(complete_df, n, filename)
    containment_vals.append(c)

# print out result
print('Original category values: \n', category_vals)
print()
print(str(n)+'-gram containment values: \n', containment_vals)

Original category values: 
 [0, 3, 2, 1, 0]

3-gram containment values: 
 [0.009345794392523364, 0.9641025641025641, 0.6136363636363636, 0.15675675675675677, 0.031746031746031744]


In [15]:
# Function returns a list of containment features, calculated for a given n 
# Should return a list of length 100 for all files in a complete_df
def create_containment_features(df, n, column_name=None):
    '''Compute the n-gram containment feature for every row of `df`.

       :param df: dataframe with 'File' and numeric 'Category' columns, plus the
           columns `calculate_containment` reads
       :param n: the ngram size
       :param column_name: accepted for backward compatibility; it does not
           influence the returned values
       :return: a list with one entry per row, in row order: the containment value
           for rows with Category > -1, and -1 for the remaining (original) rows'''

    containment_values = []

    # walk the rows in order, reading both needed columns directly
    for file, category in zip(df['File'], df['Category']):
        # Computes features using calculate_containment function
        if category > -1:
            containment_values.append(calculate_containment(df, n, file))
        # Sets value to -1 for original tasks
        else:
            containment_values.append(-1)

    print(str(n)+'-gram containment features created!')
    return containment_values


In [16]:
# n-gram sizes to compute containment for: 1 through 10
ngram_range = range(1,11)

# column names of the features, filled in the loop below
features_list = []

# One row per n-gram size, one column per file; transposed into features_df below
all_features = np.zeros((len(ngram_range), len(complete_df)))

# Calculate features for containment for ngrams in range
# (i is the row of all_features that receives the values for the current n)
i=0
for n in ngram_range:
    column_name = 'c_'+str(n)
    features_list.append(column_name)
    # create containment features
    all_features[i]=np.squeeze(create_containment_features(complete_df, n))
    i+=1

# create a features dataframe: one row per file, one column per n-gram size
features_df = pd.DataFrame(np.transpose(all_features), columns=features_list)

# Print all features/columns
print()
print('Features: ', features_list)
print()

1-gram containment features created!
2-gram containment features created!
3-gram containment features created!
4-gram containment features created!
5-gram containment features created!
6-gram containment features created!
7-gram containment features created!
8-gram containment features created!
9-gram containment features created!
10-gram containment features created!

Features:  ['c_1', 'c_2', 'c_3', 'c_4', 'c_5', 'c_6', 'c_7', 'c_8', 'c_9', 'c_10']



In [17]:
# Preview the containment features of the first files (columns c_1 ... c_10)
features_df.head()

Unnamed: 0,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,c_9,c_10
0,0.398148,0.07907,0.009346,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.984694,0.964103,0.943299,0.92228,0.901042,0.879581,0.857895,0.835979,0.81383
2,0.869369,0.719457,0.613636,0.515982,0.449541,0.382488,0.319444,0.265116,0.219626,0.197183
3,0.593583,0.268817,0.156757,0.108696,0.081967,0.06044,0.044199,0.027778,0.011173,0.0
4,0.544503,0.115789,0.031746,0.005319,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Takes in dataframes and a list of selected features (column names) 
# and returns (train_x, train_y), (test_x, test_y)
def train_test_data(complete_df, features_df, selected_features):
    '''Gets selected training and test features from given dataframes, and
       returns tuples for training and test features and their corresponding class labels.

       Rows of `features_df` are matched to rows of `complete_df` by position, so
       both dataframes must list the files in the same order and have the same
       number of rows; their index labels are not used.
       :param complete_df: A dataframe with all of our processed text data, datatypes, and labels
       :param features_df: A dataframe of all computed, similarity features
       :param selected_features: An array of selected features that correspond to certain columns in `features_df`
       :return: training and test features and labels: (train_x, train_y), (test_x, test_y)'''

    feature_values = features_df[selected_features].values
    class_values = complete_df['Class'].values

    # Boolean masks are positional, so they stay correct even when complete_df
    # does not carry a default 0..n-1 index (index labels are not positions).
    train_mask = (complete_df["Datatype"] == 'train').values
    test_mask = (complete_df["Datatype"] == 'test').values

    # training features and class labels (0 or 1)
    train_x = feature_values[train_mask]
    train_y = class_values[train_mask]

    # test features and labels
    test_x = feature_values[test_mask]
    test_y = class_values[test_mask]

    return (train_x, train_y), (test_x, test_y)
    

In [19]:
# Select your list of features, this should be column names from features_df
selected_features = ['c_1', 'c_5', 'c_10']

# Split the selected feature columns and the class labels into training and test arrays
(train_x, train_y), (test_x, test_y) = train_test_data(complete_df, features_df, selected_features)

# check that division of samples seems correct
# these should add up to 95 (100 - 5 original files)
print('Training size: ', len(train_x))
print('Test size: ', len(test_x))
print()
# show the first ten training rows and their labels
print('Training df sample: \n', train_x[:10])
print('Training labels sample: \n', train_y[:10])

Training size:  70
Test size:  25

Training df sample: 
 [[0.39814815 0.         0.        ]
 [0.86936937 0.44954128 0.1971831 ]
 [0.59358289 0.08196721 0.        ]
 [0.54450262 0.         0.        ]
 [0.32950192 0.         0.        ]
 [0.59030837 0.         0.        ]
 [0.75977654 0.24571429 0.03529412]
 [0.51612903 0.         0.        ]
 [0.44086022 0.         0.        ]
 [0.97945205 0.78873239 0.5620438 ]]
Training labels sample: 
 [0 1 1 0 0 0 1 0 0 1]


In [20]:
#Function to create csv files
import os
def make_csv(x, y, filename, data_dir):
    '''Merges features and labels and converts them into one csv file with labels in the first column.

       Labels and features are paired row by row (by position); the file is written
       without a header row and without an index column.
       :param x: Data features
       :param y: Data labels
       :param filename: Name of csv file, ex. 'train.csv'
       :param data_dir: The directory where files will be saved
       '''
    # make data dir, if it does not exist (no error when it already does)
    os.makedirs(data_dir, exist_ok=True)

    # Convert to plain arrays first so both frames get a fresh 0..n-1 index;
    # otherwise labels carrying their own index (e.g. a filtered Series) would be
    # matched to feature rows by label and rows could be dropped silently.
    labels_df = pd.DataFrame(np.asarray(y))
    features_df = pd.DataFrame(np.asarray(x))
    final_df = pd.concat([labels_df, features_df], axis=1)
    final_df.to_csv(os.path.join(data_dir, filename), header=False, index=False)

    # nothing is returned, but a print statement indicates that the function has run
    print('Path created: '+str(data_dir)+'/'+str(filename))

In [21]:
# Choose directory to save train.csv and test.csv files (relative to the current working directory)
data_dir = 'plagiarism_data'

# Write each split as a header-less csv with the label in the first column
make_csv(train_x, train_y, filename='train.csv', data_dir=data_dir)
make_csv(test_x, test_y, filename='test.csv', data_dir=data_dir)

Path created: plagiarism_data/train.csv
Path created: plagiarism_data/test.csv


In [22]:
#Building model based on Linear Support Vector Machines
from sklearn import svm

# Load both header-less csv files written earlier (labels in the first column)
train_data = pd.read_csv(os.path.join(data_dir, "train.csv"), header=None, names=None)
test_data = pd.read_csv(os.path.join(data_dir, "test.csv"), header=None, names=None)
print(train_data.head())

# Column 0 holds the label, the remaining columns hold the features.
# Note: this rebinds train_x/train_y/test_x/test_y to the data read back from disk.
train_y, train_x = train_data.iloc[:,0], train_data.iloc[:,1:]
test_y, test_x = test_data.iloc[:,0], test_data.iloc[:,1:]

# Define the linear SVM classifier and fit it on the training split
model = svm.LinearSVC()
model.fit(train_x, train_y)

# Predict the class of every test sample
test_y_pred = model.predict(test_x)
print(test_y_pred)

   0         1         2         3
0  0  0.398148  0.000000  0.000000
1  1  0.869369  0.449541  0.197183
2  1  0.593583  0.081967  0.000000
3  0  0.544503  0.000000  0.000000
4  0  0.329502  0.000000  0.000000
[1 1 1 1 1 1 0 0 0 0 0 0 0 1 1 1 1 1 0 1 0 1 1 0 0]


In [23]:
# Calculate the test accuracy
from sklearn.metrics import accuracy_score

# compare the true test labels with the model's predictions
accuracy = accuracy_score(test_y, test_y_pred)

print(accuracy)

# Print out the array of predicted and true labels
print('\nPredicted class labels: ')
print(test_y_pred)
print('\nTrue class labels: ')
print(test_y.values)

0.96

Predicted class labels: 
[1 1 1 1 1 1 0 0 0 0 0 0 0 1 1 1 1 1 0 1 0 1 1 0 0]

True class labels: 
[1 1 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 0]
