# Investigate Plagiarism

# Initialise

In [2]:
# Locations of the saved model artifacts and the pickled datasets, relative to this notebook.
MODEL_DIR = '../model'
DATA_DIR = '../data'

import sys
# Put the parent directory on the import path so that `src.utils.model` can be imported below.
sys.path.insert(1, '../') 

# Load Model

In [3]:
from src.utils.model import BinaryClassifier
import os
import torch

def model_fn(model_dir):
    """Load the PyTorch model from the `model_dir` directory.

    Two files are read from ``model_dir``: ``model_info.pth`` (the constructor
    arguments ``input_features``, ``hidden_dim`` and ``output_dim``) and
    ``model.pth`` (the state dict).

    Args:
        model_dir: Directory containing ``model_info.pth`` and ``model.pth``.

    Returns:
        The ``BinaryClassifier`` with its stored weights loaded, moved to CUDA
        when available (CPU otherwise) and switched to evaluation mode.
    """
    print("Loading model.")

    # Pick the device first so both checkpoints can be remapped onto it while loading.
    # Without map_location, a checkpoint saved from a GPU fails to load on a CPU-only host.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # First, load the parameters used to create the model.
    model_info_path = os.path.join(model_dir, 'model_info.pth')
    with open(model_info_path, 'rb') as f:
        model_info = torch.load(f, map_location=device)

    print("model_info: {}".format(model_info))

    # Construct the model from the stored hyper-parameters.
    model = BinaryClassifier(model_info['input_features'], model_info['hidden_dim'], model_info['output_dim'])

    # Load the stored model parameters.
    model_path = os.path.join(model_dir, 'model.pth')
    with open(model_path, 'rb') as f:
        model.load_state_dict(torch.load(f, map_location=device))

    # Inference only: move to the target device and switch to eval mode.
    model.to(device).eval()

    print("Done loading model.")
    return model

In [4]:
# Build the classifier from the artifacts stored under MODEL_DIR.
model = model_fn(MODEL_DIR)

Loading model.
model_info: {'input_features': 2, 'hidden_dim': 7, 'output_dim': 1}
Done loading model.


In [5]:
# Provided predict function
def predict_fn(input_data, model):
    """Run ``model`` on ``input_data`` and return its outputs as a NumPy array.

    Args:
        input_data: NumPy array of features; it is cast to ``float32`` before
            being passed to the model.
        model: A ``torch.nn.Module``. It is switched to evaluation mode.

    Returns:
        The model output copied back to the CPU as a NumPy array, in whatever
        shape the model produces.
    """
    print('Predicting class probabilities for the input data...')

    # Use the device the model's weights already live on, so the input can never end up
    # on a different device from the model. Fall back to the CUDA/CPU guess only for a
    # model that has no parameters.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Process input_data so that it is ready to be sent to our model.
    data = torch.from_numpy(input_data.astype('float32')).to(device)

    # Put the model into evaluation mode
    model.eval()

    # Predicted scores; no autograd graph is needed for inference.
    with torch.no_grad():
        probabilities = model(data).cpu().detach().numpy()

    return probabilities

In [6]:
# Show the kernel's working directory; the relative MODEL_DIR / DATA_DIR paths are resolved against it.
os.getcwd()

'/home/d14xj1/repos/plagiarism_detection/medium/notebooks'

In [7]:
import pickle

# Load the pre-computed feature matrix for the article pairs.
# The context manager closes the file handle, which the bare open() call left open.
# NOTE: pickle can execute arbitrary code while loading; only open files produced by this project.
with open(os.path.join(DATA_DIR, 'coronavirus_feature_matrix.p'), 'rb') as feature_file:
    feature_matrix = pickle.load(feature_file)

In [8]:
# Preview the first rows of the loaded feature matrix.
feature_matrix.head()

Unnamed: 0,A,B,author_A,author_B,link_A,link_B,article_A,article_B,c_20,lcs_word
0,0,1,Blank,Bloomberg,https://towardsdatascience.com/its-time-to-get...,https://medium.com/bloomberg/can-we-get-a-vacc...,Blank,Can We Get a Vaccine Early? How the Rich Are P...,,0.0
1,0,2,Blank,Emily Mullin,https://towardsdatascience.com/its-time-to-get...,https://onezero.medium.com/new-testing-tech-co...,Blank,"Scientists Are Racing to Get Us Faster, Simple...",,0.0
2,0,3,Blank,Rebekah Monson,https://towardsdatascience.com/its-time-to-get...,https://medium.com/whereby-us/5-practical-tips...,Blank,5 practical tips for managing newly remote tea...,,0.0
3,0,4,Blank,Aaron Schnoor,https://towardsdatascience.com/its-time-to-get...,https://medium.com/lessons-from-history/is-the...,Blank,Is the Coronavirus the New Black Plague? - Les...,,0.0
4,0,5,Blank,Slava Kurilyak,https://towardsdatascience.com/its-time-to-get...,https://blog.produvia.com/artificial-intellige...,Blank,Coronavirus Update: Real Estate and Artificial...,,0.0


In [9]:
# Inspect the two columns that are passed to the model as input features.
feature_matrix[['c_20', 'lcs_word']].head()

Unnamed: 0,c_20,lcs_word
0,,0.0
1,,0.0
2,,0.0
3,,0.0
4,,0.0


In [10]:
# Score every article pair with the classifier, using c_20 and lcs_word as the two input features.
# NOTE(review): the preview of these columns shows NaN in c_20 for some rows; those values are passed
# to the model unchanged — confirm that is intended.
# NOTE(review): the column name 'curtailment_prob' does not mention plagiarism — consider renaming
# (check every later use of the column first).
feature_matrix['curtailment_prob'] = predict_fn(feature_matrix[['c_20', 'lcs_word']].values, model)

Predicting class probabilities for the input data...


In [11]:
# Rank the pairs from highest to lowest score and renumber the rows 0..n-1, so that
# label-based lookups such as .loc[0, ...] address the top-ranked pair.
# The sorted frame is rebound instead of using inplace=True, which pandas discourages.
feature_matrix = feature_matrix.sort_values('curtailment_prob', ascending=False, ignore_index=True)

In [12]:
# Show the highest-scoring pairs after sorting by curtailment_prob.
feature_matrix.head()

Unnamed: 0,A,B,author_A,author_B,link_A,link_B,article_A,article_B,c_20,lcs_word,curtailment_prob
0,304,606,Kawandeep Virdee,Marley K.,https://medium.com/@whichlight/anxieties-affir...,https://medium.com/@marleyk/going-from-have-to...,Anxieties & Affirmations During the Coronaviru...,Going From Have to Have Not During The Coronav...,0.061538,0.527473,0.915797
1,304,915,Kawandeep Virdee,Tomas Pueyo,https://medium.com/@whichlight/anxieties-affir...,https://medium.com/@tomaspueyo/coronavirus-how...,Anxieties & Affirmations During the Coronaviru...,Coronavirus: How to Do Testing and Contact Tra...,0.061538,0.516484,0.907947
2,304,448,Kawandeep Virdee,Jim Scheinman,https://medium.com/@whichlight/anxieties-affir...,https://medium.com/@jimscheinman/hope-after-th...,Anxieties & Affirmations During the Coronaviru...,Hope after the Coronavirus Pandemic: Are These...,0.061538,0.516484,0.907947
3,304,421,Kawandeep Virdee,Tomas Pueyo,https://medium.com/@whichlight/anxieties-affir...,https://medium.com/@tomaspueyo/coronavirus-pre...,Anxieties & Affirmations During the Coronaviru...,Coronavirus: Prevent Seeding and Spreading - T...,0.061538,0.505495,0.899446
4,304,350,Kawandeep Virdee,Marta Brzosko,https://medium.com/@whichlight/anxieties-affir...,https://medium.com/@martabrzosko/what-if-the-c...,Anxieties & Affirmations During the Coronaviru...,What If The Coronavirus Is An Opportunity? - M...,0.061538,0.505495,0.899446


# Investigating Further

In [13]:
# Print both article URLs for rows 0-4 so each pair can be opened and compared by hand.
for suspect in range(5):
    print(f"Articles for suspect {suspect}")
    for link_column in ('link_A', 'link_B'):
        print(feature_matrix.loc[suspect, link_column])
    print('\n')

Articles for suspect 0
https://medium.com/@whichlight/anxieties-affirmations-during-the-coronavirus-68cf0c84389e?source=search_post
https://medium.com/@marleyk/going-from-have-to-have-not-during-the-coronavirus-9d4f3a8ef4be?source=search_post


Articles for suspect 1
https://medium.com/@whichlight/anxieties-affirmations-during-the-coronavirus-68cf0c84389e?source=search_post
https://medium.com/@tomaspueyo/coronavirus-how-to-do-testing-and-contact-tracing-bde85b64072e?source=search_post


Articles for suspect 2
https://medium.com/@whichlight/anxieties-affirmations-during-the-coronavirus-68cf0c84389e?source=search_post
https://medium.com/@jimscheinman/hope-after-the-coronavirus-pandemic-are-these-10-new-consumer-behaviors-here-to-stay-cafa2d76981b?source=search_post


Articles for suspect 3
https://medium.com/@whichlight/anxieties-affirmations-during-the-coronavirus-68cf0c84389e?source=search_post
https://medium.com/@tomaspueyo/coronavirus-prevent-seeding-and-spreading-e84ed405e37d?source

In [14]:
# Number of whitespace-separated words in article_A of the row labelled 0.
len(feature_matrix.loc[0, 'article_A'].split())

91

In [15]:
def word_count(article):
    """Return the number of whitespace-separated words in ``article``.

    Values without a ``split`` method (for example ``None`` or a float NaN
    standing in for a missing article) count as zero words instead of raising
    ``AttributeError``.
    """
    split = getattr(article, 'split', None)
    if split is None:
        return 0
    return len(split())

In [25]:
# Store the word count of article_A in a new column (article_B is not counted here).
feature_matrix['word_count'] = feature_matrix['article_A'].map(word_count)

In [30]:
# Keep only pairs whose article_A has more than 300 words, then renumber the rows from 0.
# NOTE(review): this overwrites feature_matrix, so the unfiltered frame is no longer available to later
# cells; the 300-word cutoff is applied to article_A only — confirm article_B should not be checked too.
feature_matrix = feature_matrix.loc[feature_matrix['word_count'] > 300].reset_index(drop = True)

In [31]:
# First rows of the frame after the short article_A entries were filtered out.
feature_matrix.head()

Unnamed: 0,A,B,author_A,author_B,link_A,link_B,article_A,article_B,c_20,lcs_word,curtailment_prob,word_count
0,849,915,FOIN Official,Tomas Pueyo,https://medium.com/@foiniocommunitiy/foin-and-...,https://medium.com/@tomaspueyo/coronavirus-how...,FOIN and Coronavirus outbreak - FOIN Official ...,Coronavirus: How to Do Testing and Contact Tra...,0.011494,0.461538,0.7982,364
1,55,282,Dana G Smith,Ed Yong,https://coronavirus.medium.com/top-5-coronavir...,https://medium.com/the-atlantic/why-the-corona...,Top 5 Coronavirus Resources - Medium Coronavir...,Why the Coronavirus Is So Confusing - The Atla...,0.023102,0.434921,0.775028,315
2,70,169,David Paul Kirkpatrick,Florin Badita,https://medium.com/@davidpkirkpatrick/why-are-...,https://medium.com/@baditaflorin/why-some-of-u...,Why Are We Ignoring Farr’s Law of Epidemics? C...,Why some of us will die — A look into Coronavi...,0.021333,0.429306,0.763499,389
3,84,169,Michael Laitman,Florin Badita,https://medium.com/@michaellaitman/when-will-t...,https://medium.com/@baditaflorin/why-some-of-u...,When Will the Coronavirus Pandemic End? - Mich...,Why some of us will die — A look into Coronavi...,0.014028,0.434951,0.761356,515
4,849,872,FOIN Official,Tomas Pueyo,https://medium.com/@foiniocommunitiy/foin-and-...,https://medium.com/@tomaspueyo/coronavirus-the...,FOIN and Coronavirus outbreak - FOIN Official ...,Coronavirus: The Hammer and the Dance - Tomas ...,0.011494,0.436813,0.760453,364


In [32]:
# Print both article URLs for rows 0-4 of the word-count-filtered frame.
for suspect in range(5):
    print(f"Articles for suspect {suspect}")
    for link_column in ('link_A', 'link_B'):
        print(feature_matrix.loc[suspect, link_column])
    print('\n')

Articles for suspect 0
https://medium.com/@foiniocommunitiy/foin-and-coronavirus-outbreak-2840f4a08a38?source=search_post
https://medium.com/@tomaspueyo/coronavirus-how-to-do-testing-and-contact-tracing-bde85b64072e?source=search_post


Articles for suspect 1
https://coronavirus.medium.com/top-5-coronavirus-resources-6790711e251c?source=search_post
https://medium.com/the-atlantic/why-the-coronavirus-is-so-confusing-20cf8f8995cb?source=search_post


Articles for suspect 2
https://medium.com/@davidpkirkpatrick/why-are-we-ignoring-farrs-law-of-epidemics-coronavirus-should-be-gone-by-summer-7782f3622c3a?source=search_post
https://medium.com/@baditaflorin/why-some-of-us-will-die-a-look-into-the-next-2020-pandemic-coronavirus-covid-19-b734513e95a0?source=search_post


Articles for suspect 3
https://medium.com/@michaellaitman/when-will-the-coronavirus-pandemic-end-86470c96a7cb?source=search_post
https://medium.com/@baditaflorin/why-some-of-us-will-die-a-look-into-the-next-2020-pandemic-corona

In [34]:
# Display the pairs ordered by c_20, highest first. The result is not assigned, so feature_matrix is unchanged.
feature_matrix.sort_values('c_20', ascending = False)

Unnamed: 0,A,B,author_A,author_B,link_A,link_B,article_A,article_B,c_20,lcs_word,curtailment_prob,word_count
21967,169,630,Florin Badita,Sequoia,https://medium.com/@baditaflorin/why-some-of-u...,https://medium.com/sequoia-capital/coronavirus...,Why some of us will die — A look into Coronavi...,Coronavirus: The Black Swan of 2020 - Sequoia ...,0.078884,0.083867,0.309444,12484
199,213,270,Zoe Naz,sarah james,https://medium.com/slackjaw/the-millennial-res...,https://medium.com/slackjaw/are-you-trying-to-...,The Millennial Resilience Test: Will You Survi...,Are You Trying To Escape Reality By Watching T...,0.074699,0.294664,0.605308,431
263,213,217,Zoe Naz,Dan Ryan,https://medium.com/slackjaw/the-millennial-res...,https://medium.com/slackjaw/coronavirus-porn-6...,The Millennial Resilience Test: Will You Survi...,Coronavirus Porn - Slackjaw - MediumOpen in ap...,0.074699,0.287703,0.590425,431
221,306,947,Indi Samarajiva,Roxanne Khamsi,https://elemental.medium.com/forget-face-masks...,https://elemental.medium.com/what-it-takes-to-...,Why Face Masks Won't Protect You Against Coron...,What Scientists Are Doing to Kill the Coronavi...,0.073559,0.293204,0.599880,515
840,306,780,Indi Samarajiva,Robert Roy Britt,https://elemental.medium.com/forget-face-masks...,https://elemental.medium.com/what-the-coronavi...,Why Face Masks Won't Protect You Against Coron...,That Picture Of the Coronavirus Isn't Quite Ac...,0.069583,0.266019,0.532306,515
...,...,...,...,...,...,...,...,...,...,...,...,...
191128,457,642,Courtney Rubin,Robert Roy Britt,https://marker.medium.com/i-turned-down-a-4-mi...,https://elemental.medium.com/coronavirus-might...,I Turned Down a $4 Million Term Sheet. Then th...,"Coronavirus Might Attack the Brain, Too - Elem...",0.000000,0.127742,0.290247,775
191127,457,799,Courtney Rubin,Kelli María Korducki,https://marker.medium.com/i-turned-down-a-4-mi...,https://forge.medium.com/my-coronavirus-optimi...,I Turned Down a $4 Million Term Sheet. Then th...,Coronavirus: The Case for Optimism During Cris...,0.000000,0.127742,0.290247,775
191126,457,854,Courtney Rubin,Ayala Laufer-Cahana M.D.,https://marker.medium.com/i-turned-down-a-4-mi...,https://medium.com/@DrAyala/coronavirus-outbre...,I Turned Down a $4 Million Term Sheet. Then th...,Coronavirus Outbreak: Facts Help Fight Panic -...,0.000000,0.127742,0.290247,775
191125,827,832,"Herbert Dyer, Jr.",David Siegel,https://medium.com/@hdyerjr/white-supremacist-...,https://medium.com/datadriveninvestor/coronavi...,Domestic Terrorist Killed By FBI Before Settin...,Coronavirus Update: What to Do NOW - Data Driv...,0.000000,0.127743,0.290248,1777


Something weird is going on with Rickyyuan; let's ignore him.

In [None]:
# Exclude pairs whose article_A author is 'Rickyyuan' and renumber the remaining rows from 0.
# NOTE(review): only author_A is filtered; pairs with this author as author_B remain — confirm intended.
valid_comparison = feature_matrix.loc[~feature_matrix['author_A'].isin(['Rickyyuan'])].reset_index(drop = True)

In [None]:
# Print both article URLs for rows 0-4 of valid_comparison.
for suspect in range(5):
    print(f"Articles for suspect {suspect}")
    for link_column in ('link_A', 'link_B'):
        print(valid_comparison.loc[suspect, link_column])
    print('\n')

Still getting a load of short Chinese articles.

In [None]:
# Preview the first rows of valid_comparison.
valid_comparison.head()

In [None]:
# Drop pairs whose article_A was written by any of the excluded authors.
# The mask must come from valid_comparison itself: boolean masks are aligned by index label, and
# valid_comparison was renumbered with reset_index, so a mask built from feature_matrix would
# point at different rows.
excluded_authors = ['Rickyyuan', 'Yeh James', 'takkii', 'Jackie Lo', 'Morris Tai', 'Unitech']
valid_comparison = valid_comparison.loc[~valid_comparison['author_A'].isin(excluded_authors)].reset_index(drop = True)
valid_comparison.head()

In [None]:
# Print both article URLs for rows 0-4 of valid_comparison.
for suspect in range(5):
    print(f"Articles for suspect {suspect}")
    for link_column in ('link_A', 'link_B'):
        print(valid_comparison.loc[suspect, link_column])
    print('\n')

In [None]:
# Pick one row (label 600) and show the full text of its article_A; idx is reused by the following cells.
idx = 600
valid_comparison.loc[idx, 'article_A']

In [None]:
# Full text of article_B for the same row (uses idx set in the previous cell).
valid_comparison.loc[idx, 'article_B']

In [None]:
import re
# Check whether the shared excerpt occurs in article_B of the selected row.
# re.escape makes the text match literally; unescaped, the '.' would match any character.
re.search(re.escape('Your journey starts here.How to Make this Moment the Turning Point'), valid_comparison.loc[idx, 'article_B'])

In [None]:
# Flag the articles on each side of a pair that contain the shared excerpt.
# regex=False matches the excerpt as literal text; by default str.contains treats it as a
# regular expression, in which the '.' matches any character.
boilerplate_excerpt = 'Your journey starts here.How to Make this Moment the Turning Point'
valid_comparison['black_lives_A'] = valid_comparison['article_A'].str.contains(boilerplate_excerpt, regex=False)
valid_comparison['black_lives_B'] = valid_comparison['article_B'].str.contains(boilerplate_excerpt, regex=False)

In [None]:
import numpy as np
# Row-wise "both articles contain the excerpt".
# `series is True` is an identity test on the Series object and is always False, so the original
# expression produced a plain bool that cannot be sliced. Compare element-wise and combine with `&`.
both_black_lives = (valid_comparison['black_lives_A'] == True) & (valid_comparison['black_lives_B'] == True)
both_black_lives[:5]

In [None]:
# Shape before filtering, for comparison with the shape displayed at the end of the cell.
print(valid_comparison.shape)
# Keep only pairs in which neither article contains the excerpt.
# The second condition must test black_lives_B; the original tested black_lives_A twice.
has_no_boilerplate = (valid_comparison['black_lives_A'] == False) & (valid_comparison['black_lives_B'] == False)
valid_comparison = valid_comparison.loc[has_no_boilerplate].reset_index(drop = True)
valid_comparison.shape

In [None]:
# Preview the first rows of valid_comparison after the filter in the previous cell.
valid_comparison.head()

In [None]:
# Print both article URLs for rows 0-4 of valid_comparison.
for suspect in range(5):
    print(f"Articles for suspect {suspect}")
    for link_column in ('link_A', 'link_B'):
        print(valid_comparison.loc[suspect, link_column])
    print('\n')

# Most articles include the following text 
Your journey starts here.How to Make this Moment the Turning Point for Real ChangeMaintaining Professionalism In The Age of Black Death Is….A LotGENThe Psychopath in Chief5 Ways White People Can Take Action in Response to White and State-Sanctioned ViolenceAirbnbCrisp DmRandom ForestXgboostRegression

So we need to strip it out, then go back to the drawing board.

In [None]:
# Load the pickled articles file.
# The context manager closes the file handle, which the bare open() call left open.
# NOTE: pickle can execute arbitrary code while loading; only open files produced by this project.
with open(os.path.join(DATA_DIR, '_random_forest_articles_clean.p'), 'rb') as articles_file:
    articles_clean = pickle.load(articles_file)

In [None]:
# Check what kind of object the pickle contains.
type(articles_clean)

In [None]:
# List the keys of the loaded object.
articles_clean.keys()

In [None]:
# Look at a single entry (40) under the 'articles' key.
articles_clean['articles'][40]