Add the dictory one level up to the sys path so we can find srv as per
https://stackoverflow.com/a/4383597


In [3]:
import sys
sys.path.insert(1, '../') 

In [4]:
import pickle
import os
import numpy as np

In [5]:
SEARCH_TERM = 'poem'
SAVE_DIR = 'data'

In [6]:
os.getcwd()

'/home/d14xj1/repos/plagiarism_detection/medium'

# Create Features

In [7]:
from src.create_features import ngram_array, containment, calculate_containment

In [8]:
# load data
filename_clean_results = SEARCH_TERM.replace(' ', '_') +'_results_clean.p'  
results = pickle.load(open(os.path.join(SAVE_DIR, filename_clean_results), "rb"))

In [9]:
results.keys()

dict_keys(['links_worked', 'articles', 'author', 'junk', 'links_failed'])

In [10]:
filename_clean_results

'poem_results_clean.p'

In [11]:
len(results['links_worked'])
len(results['articles'])

867

In [12]:
assert len(results['links_worked']) == len(results['articles']) == len(results['author']), 'links/articles/authors should all be same length'

In [13]:
# create combinations to test
import itertools
import pandas as pd

article_indices = list(range(len(results['articles'])))
combinations = list(itertools.combinations(article_indices,  2))
results_df = pd.DataFrame(combinations, columns = ['A', 'B'])

In [14]:
results_df.shape

(375411, 2)

In [15]:
results_df.head()

Unnamed: 0,A,B
0,0,1
1,0,2
2,0,3
3,0,4
4,0,5


In [16]:
# loop over rows and populate author
n_rows = results_df.shape[0]
author_A, author_B, link_A, link_B, article_A, article_B = [], [], [], [], [], []

for row in range(n_rows):
    author_A.append(results['author'][results_df.loc[row, 'A']])
    author_B.append(results['author'][results_df.loc[row, 'B']])
    link_A.append(results['links_worked'][results_df.loc[row, 'A']])
    link_B.append(results['links_worked'][results_df.loc[row, 'B']])
    article_A.append(results['articles'][results_df.loc[row, 'A']])
    article_B.append(results['articles'][results_df.loc[row, 'B']])

In [17]:
results_df['author_A'] = author_A
results_df['author_B'] = author_B
results_df['link_A'] = link_A
results_df['link_B'] = link_B
results_df['article_A'] = article_A
results_df['article_B'] = article_B

In [19]:
results_df.head()

Unnamed: 0,A,B,author_A,author_B,link_A,link_B,article_A,article_B
0,0,1,D. Wyn Price,The Awl,https://medium.com/blueinsight/a-life-in-shade...,https://medium.com/the-awl/a-poem-by-amit-majm...,A Life in Shades of Purple - Blue Insights - M...,A Poem by Amit Majmudar - The Awl - MediumOpen...
1,0,2,D. Wyn Price,D. Wyn Price,https://medium.com/blueinsight/a-life-in-shade...,https://medium.com/publishous/message-in-a-bot...,A Life in Shades of Purple - Blue Insights - M...,Message in a Bottle: Help. - Publishous - Medi...
2,0,3,D. Wyn Price,Kristen Arnett,https://medium.com/blueinsight/a-life-in-shade...,https://medium.com/the-shocker/i-was-a-dennis-...,A Life in Shades of Purple - Blue Insights - M...,I Was A Dennis Scott Hot Shot: An Essay About ...
3,0,4,D. Wyn Price,Rahul Misra,https://medium.com/blueinsight/a-life-in-shade...,https://medium.com/@rahulmisra722/every-time-i...,A Life in Shades of Purple - Blue Insights - M...,Every Time I Write A Poem - Rahul Misra - Medi...
4,0,5,D. Wyn Price,Joi Lake,https://medium.com/blueinsight/a-life-in-shade...,https://medium.com/illumination/a-silly-smitte...,A Life in Shades of Purple - Blue Insights - M...,A Silly Smitten Poem - ILLUMINATION - MediumOp...


In [24]:
results_test = results_df[:10000]
results_test.shape

(10000, 8)

In [21]:
def containment_wrapper(article_A, article_B, n_gram_choice = 20):
    if (article_A == 'Blank') or (article_B == 'Blank'):
        containment = None
    else:
        try:
            containment = calculate_containment(article_A, article_B, n_gram_choice)
        except:
            print(f"We couldn't calculate the containment for row {row}")
            containment = None
    return containment

In [25]:
import time

In [27]:
%%time
tic = time.perf_counter()
pairwise_containment = [containment_wrapper(x, y) for x, y in zip(results_test['article_A'], results_test['article_B'])]
toc = time.perf_counter()
print(f"code ran in  {toc - tic:0.4f} seconds")

code ran in  22.9576 seconds
CPU times: user 22.9 s, sys: 16 ms, total: 23 s
Wall time: 23 s


In [None]:
pairwise_containment = []
n_gram_choice = 20

for row in range(n_rows):
# for row in range(35000):
    if (row % 5000) == 0:
        print (f"row {row} of {n_rows} which is {row/n_rows:.2f}")
   
        
    A = results_df.loc[row, 'A']
    B = results_df.loc[row, 'B']
    
    article_A = results['articles'][A]
    article_B = results['articles'][B]
    
    if (article_A == 'Blank') or (article_B == 'Blank'):
        containment = None
    else:
        try:
            containment = calculate_containment(article_A, article_B, n_gram_choice)
        except:
            print(f"We couldn't calculate the containment for row {row}")
            containment = None
        
    pairwise_containment.append(containment)

In [None]:
pairwise_containment

In [None]:
results_df.loc[188519]

In [None]:
results['articles'][332]

In [None]:
test = [1]

In [None]:
test.append(None)
test
pd.DataFrame(test)

In [None]:
results_df['containment'] = pairwise_containment

results_df.sort_values('containment', 
                       ascending = False,
                       inplace = True)

In [None]:
valid_comparison = results_df.loc[results_df['author_A'] != results_df['author_B']].reset_index(drop=True)
# valid_comparison = valid_comparison.loc[~valid_comparison['author_A'].isin(['Carlos E. Perez'])].reset_index(drop = True)
#valid_comparison = valid_comparison.loc[~valid_comparison['author_A'].isin(['Ketul G', 'Naveenan', 'Rickyyuan', 'takkii', 'Nilimesh Halder'])].reset_index(drop = True)
valid_comparison = valid_comparison.loc[~valid_comparison['link_A'].str.contains('blueinsight')].reset_index(drop = True)
valid_comparison.head(20)


In [None]:
idx = 3
print(valid_comparison.loc[idx, 'link_A'])
print(valid_comparison.loc[idx, 'link_B'])

In [None]:
results['articles'][valid_comparison.loc[idx, 'A']]

In [None]:
results['articles'][valid_comparison.loc[idx, 'B']]

In [None]:
results['articles'][469]

In [None]:
results['articles'][597]

In [None]:
filename_results = SEARCH_TERM.replace(' ', '_') +'_summary_table.p'   
pickle.dump(valid_comparison, open(os.path.join(SAVE_DIR, filename_results), "wb"))


# LCS

In [None]:
from src.create_features import lcs_norm_word

In [None]:
longest_common_subsequence = []

#for row in range(n_rows):
for row in range(35000):
    if (row % 100) == 0:
        print (f"row {row} of {n_rows} which is {row/n_rows:.2f}")
   
    A = results_df.loc[row, 'A']
    B = results_df.loc[row, 'B']
    
    article_A = results['articles'][A]
    article_B = results['articles'][B]
    
    if (article_A == 'Blank') or (article_B == 'Blank'):
        lcs = None
    else:
        try:
            lcs = lcs_norm_word(article_A, article_B)
        except:
            print(f"We couldn't calculate the containment for row {row}")
            lcs = None
        
    longest_common_subsequence.append(containment)

In [None]:
a = 1
a

In [1]:
import multiprocessing