Add the directory one level up to the sys path so we can find src, as per
https://stackoverflow.com/a/4383597


In [1]:
import sys
sys.path.insert(1, '../') 

In [2]:
import pickle
import os
import numpy as np

In [3]:
# Notebook configuration.
# SEARCH_TERM is turned into the pickle file name further down.
SEARCH_TERM = "xgboost"
# SAVE_DIR is the folder (relative to the working directory) holding that pickle.
SAVE_DIR = "data"

In [4]:
os.getcwd()

'/home/d14xj1/repos/plagiarism_detection/medium'

# Create Features

In [5]:
from src.create_features import ngram_array, containment, calculate_containment

In [6]:
# Load the cleaned results that were pickled for SEARCH_TERM.
# Spaces in the search term become underscores in the file name.
filename_clean_results = SEARCH_TERM.replace(' ', '_') + '_results_clean.p'

# SECURITY NOTE: pickle.load can execute arbitrary code from the file, so only
# load pickles that this project produced itself.
# The `with` block closes the file handle as soon as the data is read.
with open(os.path.join(SAVE_DIR, filename_clean_results), "rb") as results_file:
    results = pickle.load(results_file)

In [7]:
results.keys()

dict_keys(['links_worked', 'articles', 'author', 'junk', 'links_failed'])

In [8]:
filename_clean_results

'xgboost_results_clean.p'

In [9]:
len(results['links_worked'])
len(results['articles'])

618

In [10]:
assert len(results['links_worked']) == len(results['articles']) == len(results['author']), 'links/articles/authors should all be same length'

In [11]:
# Build the table of candidate comparisons: every unordered pair of article
# positions, one pair per row, in columns 'A' and 'B'.
import itertools
import pandas as pd

article_indices = list(range(len(results['articles'])))
combinations = list(
    itertools.combinations(article_indices, 2)
)
results_df = pd.DataFrame(
    data=combinations,
    columns=['A', 'B'],
)

In [12]:
results_df.shape

(190653, 2)

In [13]:
results_df.head()

Unnamed: 0,A,B
0,0,1
1,0,2
2,0,3
3,0,4
4,0,5


In [14]:
# For every candidate pair, look up the author, link and article text of both
# sides, and collect the two texts as a [text_A, text_B] pair.
n_rows = results_df.shape[0]
author_A, author_B = [], []
link_A, link_B = [], []
article_A, article_B = [], []
article_pairs = []

# Walk the two index columns once instead of doing repeated scalar `.loc`
# lookups for each row; the lists produced are the same, only faster to build.
# `row` is kept bound at module level, as the original loop did, in case other
# cells read it.
for row, (idx_A, idx_B) in enumerate(zip(results_df['A'], results_df['B'])):
    text_A = results['articles'][idx_A]
    text_B = results['articles'][idx_B]

    author_A.append(results['author'][idx_A])
    author_B.append(results['author'][idx_B])
    link_A.append(results['links_worked'][idx_A])
    link_B.append(results['links_worked'][idx_B])
    article_A.append(text_A)
    article_B.append(text_B)
    article_pairs.append([text_A, text_B])

In [15]:
# Attach the looked-up values to the pair table, one new column per list.
# The dict preserves insertion order, so columns are added in this order.
_new_columns = {
    'author_A': author_A,
    'author_B': author_B,
    'link_A': link_A,
    'link_B': link_B,
    'article_A': article_A,
    'article_B': article_B,
}
for _column_name, _column_values in _new_columns.items():
    results_df[_column_name] = _column_values

# Remove blanks

In [16]:
# blank_idx = (results_df['author_A'] != 'Blank') & (results_df['author_B'] != 'Blank')


In [17]:
# results_df = results_df[]

In [18]:
# results_df.shape

In [19]:
def containment_wrapper(article_A, article_B, n_gram_choice = 20):
    """Best-effort containment score for one pair of articles.

    Delegates to ``calculate_containment(article_A, article_B, n_gram_choice)``.
    If that call raises, the failure is reported on stdout and ``None`` is
    returned, so one bad pair does not abort a long batch run.

    Args:
        article_A: first article, passed through unchanged.
        article_B: second article, passed through unchanged.
        n_gram_choice: value forwarded as the third argument (default 20).

    Returns:
        Whatever ``calculate_containment`` returns, or ``None`` if it raised.
    """
    try:
        return calculate_containment(article_A, article_B, n_gram_choice)
    except Exception as exc:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate. The message reports
        # the actual error and does not depend on any notebook-level loop
        # variable, so the handler itself cannot fail or print a stale row.
        print(f"We couldn't calculate the containment: {type(exc).__name__}: {exc}")
        return None

In [20]:
import time

In [21]:
# Keep only the first N_SAMPLE_PAIRS pairs (presumably to keep the timing
# experiments below quick). One constant drives both slices so the table and
# the list of text pairs cannot drift out of step.
N_SAMPLE_PAIRS = 5000
results_df = results_df[:N_SAMPLE_PAIRS]
article_pairs = article_pairs[:N_SAMPLE_PAIRS]

# Using list comprehension

In [22]:
%%time
# Serial run: score each pair by walking the two article columns in step.
tic = time.perf_counter()
pairwise_containment = [
    containment_wrapper(text_a, text_b)
    for text_a, text_b in zip(results_df['article_A'], results_df['article_B'])
]
toc = time.perf_counter()
print(f"code ran in  {toc - tic:0.4f} seconds")

  containment_val =  intersection / answer_cnt


code ran in  33.3344 seconds
CPU times: user 33.3 s, sys: 12.2 ms, total: 33.3 s
Wall time: 33.3 s


# Making one iterable

In [23]:
%%time
# Serial run again, this time reading both texts from the single list of pairs.
tic = time.perf_counter()
pairwise_containment = [
    containment_wrapper(pair[0], pair[1])
    for pair in article_pairs
]
toc = time.perf_counter()
print(f"code ran in  {toc - tic:0.4f} seconds")

code ran in  32.8679 seconds
CPU times: user 32.8 s, sys: 24.2 ms, total: 32.9 s
Wall time: 32.9 s


# Using parallel processing

In [24]:
import multiprocessing

In [25]:
# create as many processes as there are CPUs on your machine
num_processes = multiprocessing.cpu_count()
num_processes

8

In [26]:
# Single-argument adapter, so the scoring can be handed to a pool's map/imap.
def containment_fun(article_pair):
    """Score one pair given as a sequence whose first two items are the texts."""
    first_text = article_pair[0]
    second_text = article_pair[1]
    return containment_wrapper(first_text, second_text)

In [27]:
# check it works
len(article_pairs[0])

2

In [28]:
containment_results = pd.DataFrame(pairwise_containment, columns = ['containment'])
containment_results.sort_values('containment', ascending = False)

Unnamed: 0,containment
2369,0.240741
2193,0.185185
2162,0.185185
2136,0.185185
2137,0.185185
...,...
3070,
3071,
3072,
3073,


In [29]:
assert containment_wrapper(article_pairs[2369][0], article_pairs[2369][1]) == containment_fun(article_pairs[2369])

In [30]:
pool = multiprocessing.Pool(num_processes)

  containment_val =  intersection / answer_cnt
  containment_val =  intersection / answer_cnt
  containment_val =  intersection / answer_cnt
  containment_val =  intersection / answer_cnt


In [31]:
# Time the same scoring when it is spread across the worker pool.
tic = time.perf_counter()
try:
    pairwise_containment = pool.map(containment_fun, article_pairs)
    toc = time.perf_counter()
finally:
    # Release the worker processes once the map is done so they do not linger
    # for the rest of the session. To re-run this cell, first re-run the cell
    # that creates `pool`.
    pool.close()
    pool.join()
print(f"code ran in  {toc - tic:0.4f} seconds")

code ran in  8.5707 seconds


# with progress bar

In [33]:
from tqdm import tqdm

# Parallel run with a progress bar drawn as results come back.
num_processes = multiprocessing.cpu_count()

print('running in parallel')
with multiprocessing.Pool(processes=num_processes) as pool:
    # `total` is passed explicitly because the imap iterator has no length.
    result_stream = pool.imap(containment_fun, article_pairs)
    pairwise_containment = list(tqdm(result_stream, total=len(article_pairs)))
    pool.close()
    pool.join()

running in parallel


  containment_val =  intersection / answer_cnt
  containment_val =  intersection / answer_cnt
  containment_val =  intersection / answer_cnt
  containment_val =  intersection / answer_cnt
  containment_val =  intersection / answer_cnt
  containment_val =  intersection / answer_cnt
  containment_val =  intersection / answer_cnt
  containment_val =  intersection / answer_cnt
100%|██████████| 5000/5000 [00:08<00:00, 573.35it/s] 


In [34]:
len(pairwise_containment)

5000

In [None]:
# (scratch cell left empty: it held a stray undefined name that raised NameError on Run All)