In [41]:
import hashlib
import pandas as pd
import numpy as np
import os

In [42]:
# Project root that contains the `data/` folder.
# Set the PLAGIARISM_HW_DIR environment variable to run this notebook on another
# machine; the original local path is kept as the fallback so existing runs are unchanged.
# os.path.join(..., '') guarantees a trailing separator, because later cells build
# file names by plain concatenation (`path + 'data/...'`).
path = os.path.join(
    os.environ.get('PLAGIARISM_HW_DIR', 'C:/Users/Linyi Zhang/Desktop/data_mining/HW1/'),
    '',
)

In [43]:
# Locate the metadata table under the project root and load it into a dataframe.
csv_file = ''.join((path, 'data/file_information.csv'))
plagiarism_df = pd.read_csv(filepath_or_buffer=csv_file)

# Preview the first five rows of the file information table.
plagiarism_df.head(n=5)

Unnamed: 0,File,Task,Category
0,g0pA_taska.txt,a,non
1,g0pA_taskb.txt,b,cut
2,g0pA_taskc.txt,c,light
3,g0pA_taskd.txt,d,heavy
4,g0pA_taske.txt,e,non


In [44]:
# Read in a csv file and return a transformed dataframe
def numerical_dataframe(csv_file='data/file_information.csv'):
    '''Reads in a csv file which is assumed to have `File`, `Category` and `Task` columns.
       This function does two things:
       1) converts `Category` column values to numerical values
          (non -> 0, heavy -> 1, light -> 2, cut -> 3, orig -> -1)
       2) adds a new, numerical `Class` label column.
       The `Class` column labels plagiarized answers (heavy, light, cut) as 1 and
       non-plagiarized answers as 0. Source texts (orig) keep the special label -1.
       :param csv_file: Path to the file_information.csv file
       :return: A dataframe with numerical categories and a new `Class` label column
       :raises ValueError: If `Category` contains a value (including a missing value)
           that is not one of the five known labels'''

    # read in csv file
    num_df = pd.read_csv(csv_file)

    # known category labels and their numerical codes
    cat_num = {'non': 0, 'heavy': 1, 'light': 2, 'cut': 3, 'orig': -1}

    # An unmapped label would become NaN below, and since `NaN < 1` is False it would
    # then be silently labelled as plagiarized (Class = 1). Fail loudly instead.
    unknown = num_df.loc[~num_df['Category'].isin(list(cat_num)), 'Category']
    if not unknown.empty:
        raise ValueError(
            'Unknown Category value(s) in {}: {}'.format(csv_file, unknown.unique().tolist())
        )

    # convert Category column to numerical values
    num_df['Category'] = num_df['Category'].map(cat_num)

    # add Class column: every plagiarism level (1, 2, 3) collapses to 1,
    # while 0 (non) and -1 (orig) are left as they are
    num_df['Class'] = num_df['Category'].clip(upper=1)

    return num_df

In [45]:
# Quick manual check of `numerical_dataframe`: build `transformed_df` from the
# file information table stored under the project root.
transformed_df = numerical_dataframe(
    csv_file=''.join((path, 'data/file_information.csv')),
)

# Inspect the first ten rows: every plagiarism category (1, 2, 3) should carry
# a class label of 1.
transformed_df.head(n=10)

Unnamed: 0,File,Task,Category,Class
0,g0pA_taska.txt,a,0,0
1,g0pA_taskb.txt,b,3,1
2,g0pA_taskc.txt,c,2,1
3,g0pA_taskd.txt,d,1,1
4,g0pA_taske.txt,e,0,0
5,g0pB_taska.txt,a,0,0
6,g0pB_taskb.txt,b,0,0
7,g0pB_taskc.txt,c,3,1
8,g0pB_taskd.txt,d,2,1
9,g0pB_taske.txt,e,1,1


In [46]:
# Test cell: runs the provided unit test against `numerical_dataframe` and, once it
# passes, rebuilds `transformed_df` from the file information csv under `path`.
# `problem_unittests` is a project-local module that must be importable from the
# notebook's working directory.

"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""

# importing tests
import problem_unittests as tests

# test numerical_dataframe function
tests.test_numerical_df(numerical_dataframe)

# if above test is passed, create NEW `transformed_df`
transformed_df = numerical_dataframe(csv_file =path + 'data/file_information.csv')

# check work
print('\nExample data: ')
transformed_df.head()

Tests Passed!

Example data: 


Unnamed: 0,File,Task,Category,Class
0,g0pA_taska.txt,a,0,0
1,g0pA_taskb.txt,b,3,1
2,g0pA_taskc.txt,c,2,1
3,g0pA_taskd.txt,d,1,1
4,g0pA_taske.txt,e,0,0


In [47]:
"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
# `helpers` is a project-local module; it must be importable from the notebook's
# working directory.
import helpers 

# Create a text column: `text_df` is what the helper returns for `transformed_df`
# and the `data/` folder under `path`.
# NOTE(review): judging by the displayed output, the helper adds a `Text` column
# holding lower-cased, punctuation-free file contents — confirm against helpers.py.
text_df = helpers.create_text_column(transformed_df, path + 'data/')
text_df.head()

Unnamed: 0,File,Task,Category,Class,Text
0,g0pA_taska.txt,a,0,0,inheritance is a basic concept of object orien...
1,g0pA_taskb.txt,b,3,1,pagerank is a link analysis algorithm used by ...
2,g0pA_taskc.txt,c,2,1,the vector space model also called term vector...
3,g0pA_taskd.txt,d,1,1,bayes theorem was names after rev thomas bayes...
4,g0pA_taske.txt,e,0,0,dynamic programming is an algorithm design tec...


In [48]:
# after running the cell above
# check out the processed text for a single file, by row index
row_idx = 99 # feel free to change this index

# Look the row up through `row_idx` (positional index) rather than a literal,
# so that editing the index above really changes which document is shown.
sample_text = text_df.iloc[row_idx]['Text']

print('Sample processed text:\n\n', sample_text)
print("total number of documents: ", len(text_df))

Sample processed text:

 in mathematics and computer science dynamic programming is a method of solving problems that exhibit the properties of overlapping subproblems and optimal substructure described below  the method takes much less time than naive methods the term was originally used in the 1940s by richard bellman to describe the process of solving problems where one needs to find the best decisions one after another by 1953 he had refined this to the modern meaning the field was founded as a systems analysis and engineering topic that is recognized by the ieee bellman s contribution is remembered in the name of the bellman equation a central result of dynamic programming which restates an optimization problem in recursive form the word programming in dynamic programming has no particular connection to computer programming at all and instead comes from the term mathematical programming  a synonym for optimization thus the program is the optimal plan for action that is produced fo

Shingling

In [49]:
class Shingling:
    """Builds character k-shingles of documents and their SHA-1 hashes.

    One instance accumulates results over every document passed to
    `process_document`: `shingles` keeps the distinct shingle strings seen so far,
    while `hashed_shingles` keeps one hash per shingle occurrence (duplicates
    included) in processing order.
    """

    def __init__(self, k):
        """Create an empty accumulator.

        :param k: Shingle size, i.e. number of characters per shingle; must be >= 1
        :raises ValueError: If `k` is smaller than 1 (k = 0 would yield only empty
            shingles and a negative k would yield meaningless slices)
        """
        if k < 1:
            raise ValueError('Shingle size k must be at least 1, got {!r}'.format(k))
        self.k = k  # Shingle size
        self.shingles = set()  # Set to store unique shingles
        self.hashed_shingles = []  # List to store hashed shingles in order

    def shingle_document(self, document):
        """Shingle a single document without touching the instance state.

        :param document: Text to shingle (a `str`)
        :return: Tuple `(shingles, hashed_shingles)`: the set of distinct k-character
            substrings, and the list of SHA-1 hex digests of every substring in
            document order (one entry per position, so repeated shingles repeat).
            Both are empty when the document is shorter than `k`.
        """
        # Every window of k consecutive characters; the range is empty for short documents.
        windows = [document[start:start + self.k]
                   for start in range(len(document) - self.k + 1)]
        # Use SHA-1 for hashing
        hashed_shingles = [hashlib.sha1(window.encode()).hexdigest() for window in windows]
        return set(windows), hashed_shingles

    def process_document(self, document):
        """Shingle `document` and merge the result into the instance accumulators.

        :param document: Text to shingle (a `str`)
        """
        shingles, hashed_shingles = self.shingle_document(document)
        self.shingles.update(shingles)
        self.hashed_shingles.extend(hashed_shingles)

    def get_ordered_hashed_shingles(self):
        """Return a new, lexicographically sorted list of all accumulated hashes.

        Duplicates are kept; the stored `hashed_shingles` list is not modified.
        """
        return sorted(self.hashed_shingles)

path

In [60]:


# One Shingling instance is shared by all documents; k is the shingle length in
# characters (pick whatever value suits the corpus).
shingling = Shingling(k=10)

# Feed the processed text of every row into the shared instance.
for doc_text in text_df['Text']:
    shingling.process_document(doc_text)

# Distinct shingle strings vs. total shingle occurrences across the corpus.
print("Number of unique shingles:", len(shingling.shingles))
print("Number of total shingles:", len(shingling.hashed_shingles))

# Sorted hashes of every shingle occurrence; show two entries as a spot check.
ordered_hashed_shingles = shingling.get_ordered_hashed_shingles()
for position in (0, 3):
    print("Ordered Hashed Shingles:", ordered_hashed_shingles[position])

Number of unique shingles: 61077
Number of total shingles: 128167
Ordered Hashed Shingles: 0000764426739309134ec17c53eeb1f57d731e08
Ordered Hashed Shingles: 0001bfbb7d651040a00b0b047e2153349aa0752d
