# Investigate Plagiarism

In [1]:
import sys
sys.path.insert(1, '../') 

# Directory passed to model_fn; it is expected to contain 'model_info.pth' and 'model.pth'.
MODEL_DIR = '../model'
# Directory from which predict_curtailment reads the '<search_term>_feature_matrix.p' pickles.
DATA_DIR = '../data'

# Load Model

In [2]:
from src.utils.model import BinaryClassifier
import os
import torch

def model_fn(model_dir):
    """Load the PyTorch model from the `model_dir` directory.

    Two files are read from `model_dir`:

    * ``model_info.pth`` -- a mapping with the keys ``'input_features'``,
      ``'hidden_dim'`` and ``'output_dim'`` used to construct the
      ``BinaryClassifier``.
    * ``model.pth`` -- the state dict loaded into that classifier.

    Parameters
    ----------
    model_dir : str
        Directory containing the two files above.

    Returns
    -------
    BinaryClassifier
        The model, moved to CUDA when available (CPU otherwise) and
        switched to evaluation mode.
    """
    print("Loading model.")

    # Pick the device first so both checkpoints can be remapped onto it while
    # loading; without `map_location`, a checkpoint saved on a GPU cannot be
    # deserialised on a CPU-only machine.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # First, load the parameters used to create the model.
    model_info_path = os.path.join(model_dir, 'model_info.pth')
    with open(model_info_path, 'rb') as f:
        model_info = torch.load(f, map_location=device)

    print("model_info: {}".format(model_info))

    # Construct the model from the stored hyper-parameters.
    model = BinaryClassifier(model_info['input_features'], model_info['hidden_dim'], model_info['output_dim'])

    # Load the stored model parameters.
    model_path = os.path.join(model_dir, 'model.pth')
    with open(model_path, 'rb') as f:
        model.load_state_dict(torch.load(f, map_location=device))

    # Move to the target device and switch to eval mode for inference.
    model.to(device).eval()

    print("Done loading model.")
    return model

In [3]:
# Load the trained classifier once; predict_curtailment reads this module-level `model`.
model = model_fn(MODEL_DIR)

Loading model.
model_info: {'input_features': 2, 'hidden_dim': 7, 'output_dim': 1}
Done loading model.


In [4]:
# Provided predict function
def predict_fn(input_data, model):
    """Run `model` on a NumPy feature array and return its outputs as NumPy.

    Parameters
    ----------
    input_data : numpy.ndarray
        Feature array; it is cast to ``float32`` before being turned into a
        tensor.
    model : torch.nn.Module
        Model to evaluate. It is put into evaluation mode here but is NOT
        moved between devices, so it must already live on the device chosen
        below (CUDA when available, CPU otherwise).

    Returns
    -------
    numpy.ndarray
        The raw output of ``model(data)`` copied back to the CPU. The caller
        treats these values as class probabilities.
    """
    print('Predicting class probabilities for the input data...')

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Process input_data so that it is ready to be sent to our model.
    data = torch.from_numpy(input_data.astype('float32')).to(device)

    # Put the model into evaluation mode
    model.eval()

    # Inference only: skip building the autograd graph for the forward pass.
    with torch.no_grad():
        probabilities = model(data).cpu().detach().numpy()

    return probabilities

In [5]:
import os
import pickle

def predict_curtailment(search_term, min_length = 300):
    """Score every article pair for `search_term` and rank them by model output.

    NOTE(review): the name and the 'curtailment_prob' column look like
    leftovers from another project; in this notebook the score is used to
    rank potential plagiarism. Names are kept so existing callers still work.

    Parameters
    ----------
    search_term : str
        Prefix of the pickle to load: ``<DATA_DIR>/<search_term>_feature_matrix.p``.
    min_length : int, optional
        Pairs are kept only when both 'word_count_A' and 'word_count_B' are
        strictly greater than this value (default 300).

    Returns
    -------
    pandas.DataFrame
        The filtered feature matrix with an added 'curtailment_prob' column,
        sorted by that column in descending order with a fresh 0..n-1 index.
    """
    # SECURITY: unpickling can execute arbitrary code. Only load feature
    # matrices that were produced by this project.
    feature_path = os.path.join(DATA_DIR, f"{search_term}_feature_matrix.p")
    with open(feature_path, 'rb') as f:  # close the handle deterministically
        feature_matrix = pickle.load(f)

    # remove short articles; copy so the column assignment below writes to an
    # independent frame rather than to a slice of the loaded one
    long_enough = (feature_matrix['word_count_A'] > min_length) & (feature_matrix['word_count_B'] > min_length)
    feature_matrix = feature_matrix[long_enough].copy()

    # score each pair from its 'c_20' and 'lcs_word' features using the
    # module-level `model`
    feature_matrix['curtailment_prob'] = predict_fn(feature_matrix[['c_20', 'lcs_word']].values, model)

    # highest score first
    return feature_matrix.sort_values('curtailment_prob', ascending = False, ignore_index = True)

In [6]:
def display_links(feature_matrix, top_n=5):
    """Print the 'link_A' / 'link_B' URLs of the first `top_n` rows.

    Rows are taken by position, so the frame should already be sorted with
    the most suspicious pairs first. If the frame has fewer than `top_n`
    rows, only the available rows are printed instead of raising a KeyError.

    Parameters
    ----------
    feature_matrix : pandas.DataFrame
        Frame with 'link_A' and 'link_B' columns.
    top_n : int, optional
        Maximum number of pairs to print (default 5).
    """
    # head() clamps to the number of rows available; guard against negatives,
    # which head() would interpret as "all but the last n".
    top_rows = feature_matrix.head(max(top_n, 0))
    for idx, (link_a, link_b) in enumerate(zip(top_rows['link_A'], top_rows['link_B'])):
        print(f"Potential plagiarism: {idx}")
        print(link_a)
        print(link_b)
        print('\n')

# Logistic Regression

In [7]:
# Score and rank the 'logistic_regression' article pairs, then preview the top of the ranking.
feature_matrix = predict_curtailment('logistic_regression')
feature_matrix.head()

Predicting class probabilities for the input data...


Unnamed: 0,A,B,author_A,author_B,link_A,link_B,article_A,article_B,c_20,lcs_word,word_count_A,word_count_B,curtailment_prob
0,107,108,Jingles (Hong Jing),Elenjubbas,https://towardsdatascience.com/why-linear-regr...,https://medium.com/@elenjubbas/linear-regressi...,Why Linear Regression is not suitable for Clas...,Linear Regression vs. Logistic Regression for ...,0.441468,0.510222,1125,600,0.993093
1,78,299,ROHITH RAMESH,Sarang Narkhede,https://medium.com/analytics-vidhya/logistic-r...,https://towardsdatascience.com/understanding-l...,Logistic Regression - Analytics Vidhya - Mediu...,Understanding Logistic Regression - Towards Da...,0.242424,0.442217,848,654,0.948336
2,152,187,Rajwrita Nath,Apoorva Agrawal,https://medium.com/@rajwritanath/logistic-regr...,https://medium.com/data-science-group-iitr/log...,Logistic Regression- the Theory and Code - Raj...,Logistic Regression. Simplified. - Data Scienc...,0.240994,0.395657,829,845,0.931613
3,182,189,Harshit Nigam,Animesh Agarwal,https://medium.com/essence-of-learning/learnin...,https://towardsdatascience.com/building-a-logi...,Learning Logistic Regression - Essence of Lear...,Building a Logistic Regression in Python - Tow...,0.09242,0.339979,953,1445,0.718867
4,175,191,Ayush Pant,suresh hp,https://towardsdatascience.com/introduction-to...,https://medium.com/@hpsuresh12345/logistic-reg...,Introduction to Logistic Regression - Towards ...,Logistic regression in Statistics and Machine ...,0.103862,0.290283,813,925,0.676288


In [8]:
# Print the untruncated links for the highest-ranked 'logistic_regression' pairs.
display_links(feature_matrix)

Potential plagiarism: 0
https://towardsdatascience.com/why-linear-regression-is-not-suitable-for-binary-classification-c64457be8e28?source=search_post
https://medium.com/@elenjubbas/linear-regression-vs-logistic-regression-for-classification-tasks-b42f85487857?source=search_post


Potential plagiarism: 1
https://medium.com/analytics-vidhya/logistic-regression-f9845e1aca5e?source=search_post
https://towardsdatascience.com/understanding-logistic-regression-9b02c2aec102?source=search_post---------8


Potential plagiarism: 2
https://medium.com/@rajwritanath/logistic-regression-the-the-e8ed646e6a29?source=search_post
https://medium.com/data-science-group-iitr/logistic-regression-simplified-9b4efe801389?source=search_post---------6


Potential plagiarism: 3
https://medium.com/essence-of-learning/learning-logistic-regression-17d35a985813?source=search_post
https://towardsdatascience.com/building-a-logistic-regression-in-python-301d27367c24?source=search_post---------7


Potential plagiarism: 

# Naive Bayes

In [9]:
# Score and rank the 'naive_bayes' article pairs; note this rebinds `feature_matrix`.
feature_matrix = predict_curtailment('naive_bayes')
feature_matrix.head()

Predicting class probabilities for the input data...


Unnamed: 0,A,B,author_A,author_B,link_A,link_B,article_A,article_B,c_20,lcs_word,word_count_A,word_count_B,curtailment_prob
0,93,196,Jahnavi Mahanta,RealityEngines.AI,https://medium.com/@mahjahnavi/natural-languag...,https://medium.com/reality-engines/natural-lan...,Natural Language Processing — An overview of k...,Natural Language Processing — An Overview of K...,0.591416,0.831041,2545,2251,0.999668
1,173,291,Hrishav kumar,HT,https://medium.com/@hrishavkmr/naive-bayes-in-...,https://medium.com/@hackares/naive-bayes-algor...,Naive Bayes in Machine Learning - Hrishav kuma...,Naive Bayes Algorithm - HT - MediumOpen in app...,0.278261,0.470361,776,1173,0.967819
2,22,286,Jinde Shubham,Victor Roman,https://medium.com/coinmonks/spam-detector-usi...,https://towardsdatascience.com/naive-bayes-int...,SPAM DETECTOR USING NAIVE BAYES - Coinmonks - ...,Naive Bayes Algorithm: Intuition and Implement...,0.214646,0.519052,1181,2468,0.958193
3,312,518,Akshay Chavan,Pradeepsingam,https://medium.com/@akshayc123/naive-bayes-cla...,https://medium.com/@pradeepsingam333/naive-bay...,Naive Bayes Classifier (NB) : - Akshay Chavan ...,Naive Bayes(NB) Classifier - Pradeepsingam - M...,0.214967,0.452203,1475,1342,0.938491
4,218,291,Gaurav Chauhan,HT,https://towardsdatascience.com/all-about-naive...,https://medium.com/@hackares/naive-bayes-algor...,All about Naive Bayes - Towards Data ScienceOp...,Naive Bayes Algorithm - HT - MediumOpen in app...,0.215592,0.406337,1073,1173,0.920527


In [10]:
# Print the untruncated links for the highest-ranked 'naive_bayes' pairs.
display_links(feature_matrix)

Potential plagiarism: 0
https://medium.com/@mahjahnavi/natural-language-processing-an-overview-of-key-algorithms-and-their-evolution-2d9612d1f764?source=search_post
https://medium.com/reality-engines/natural-language-processing-an-overview-of-key-algorithms-and-their-evolution-3588d2cef90f?source=search_post


Potential plagiarism: 1
https://medium.com/@hrishavkmr/naive-bayes-in-machine-learning-5c0972340b76?source=search_post
https://medium.com/@hackares/naive-bayes-algorithm-e565daa89eb7?source=search_post


Potential plagiarism: 2
https://medium.com/coinmonks/spam-detector-using-naive-bayes-c22cc740e257?source=search_post
https://towardsdatascience.com/naive-bayes-intuition-and-implementation-ac328f9c9718?source=search_post


Potential plagiarism: 3
https://medium.com/@akshayc123/naive-bayes-classifier-nb-7429a1bdb2c0?source=search_post
https://medium.com/@pradeepsingam333/naive-bayes-nb-classifier-185f2ee5d840?source=search_post


Potential plagiarism: 4
https://towardsdatascience.

# Random Forest

In [11]:
# Score and rank the 'random_forest' article pairs; note this rebinds `feature_matrix`.
feature_matrix = predict_curtailment('random_forest')
feature_matrix.head()

Predicting class probabilities for the input data...


Unnamed: 0,A,B,author_A,author_B,link_A,link_B,article_A,article_B,c_20,lcs_word,word_count_A,word_count_B,curtailment_prob
0,104,107,Ashutosh Dutt Mishra,Synced,https://medium.com/datadriveninvestor/ensemble...,https://medium.com/@Synced/how-random-forest-a...,Ensemble Learning and Random Forest - Data Dri...,How Random Forest Algorithm Works in Machine L...,0.06968,0.441818,550,1188,0.795648
1,78,153,Bhartendu Dubey,Krishni,https://medium.com/@bhartendudubey/random-fore...,https://medium.com/datadriveninvestor/random-f...,Random Forest Regression - Bhartendu Dubey - M...,A Beginners Guide to Random Forest Regression ...,0.097859,0.383481,339,765,0.778037
2,156,258,Julia Kho,Gopal Singh,https://towardsdatascience.com/why-random-fore...,https://medium.com/data-science-bridge/random-...,Why Random Forest is My Favorite Machine Learn...,Random Forest Regression - Data Science Bridge...,0.149805,0.296332,1036,850,0.765955
3,63,136,Terence S,Hiromi Suenaga,https://medium.com/@terenceshin/52-weeks-of-da...,https://medium.com/@hiromi_suenaga/machine-lea...,52 Weeks of Data Science: My First Machine Lea...,Machine Learning 1: Lesson 6 - Hiromi Suenaga ...,0.0,0.419958,481,12003,0.645843
4,78,183,Bhartendu Dubey,Hiromi Suenaga,https://medium.com/@bhartendudubey/random-fore...,https://medium.com/@hiromi_suenaga/machine-lea...,Random Forest Regression - Bhartendu Dubey - M...,Machine Learning 1: Lesson 4 - Hiromi Suenaga ...,0.0,0.418879,339,11456,0.644328


In [12]:
# Print the untruncated links for the highest-ranked 'random_forest' pairs.
display_links(feature_matrix)

Potential plagiarism: 0
https://medium.com/datadriveninvestor/ensemble-learning-and-random-forest-7430ebf3da7e?source=search_post
https://medium.com/@Synced/how-random-forest-algorithm-works-in-machine-learning-3c0fe15b6674?source=search_post


Potential plagiarism: 1
https://medium.com/@bhartendudubey/random-forest-regression-d751df81cc54?source=search_post
https://medium.com/datadriveninvestor/random-forest-regression-9871bc9a25eb?source=search_post


Potential plagiarism: 2
https://towardsdatascience.com/why-random-forest-is-my-favorite-machine-learning-model-b97651fa3706?source=search_post
https://medium.com/data-science-bridge/random-forest-regression-ddfc88c92689?source=search_post


Potential plagiarism: 3
https://medium.com/@terenceshin/52-weeks-of-data-science-my-first-machine-learning-algorithm-a9d3df37aa0e?source=search_post
https://medium.com/@hiromi_suenaga/machine-learning-1-lesson-6-14bbb8180d49?source=search_post


Potential plagiarism: 4
https://medium.com/@bhartendudu

# XGBoost

In [13]:
# Score and rank the 'xgboost' article pairs; note this rebinds `feature_matrix`.
feature_matrix = predict_curtailment('xgboost')
feature_matrix.head()

Predicting class probabilities for the input data...


Unnamed: 0,A,B,author_A,author_B,link_A,link_B,article_A,article_B,c_20,lcs_word,word_count_A,word_count_B,curtailment_prob
0,469,597,Shubham Goyal,Knoldus Inc.,https://towardsdatascience.com/boosting-perfor...,https://medium.com/@knoldus/machinex-boosting-...,Boosting performance with XGBoost - Towards Da...,MachineX: Boosting performance with XGBoost - ...,0.782609,0.908903,966,960,0.999955
1,84,354,Ahan M R,Steven Liu,https://medium.com/intel-student-ambassadors/b...,https://towardsdatascience.com/boosting-your-w...,Building Scalable Tree Boosting Methods- Tunin...,Boosting your way to the top with XGBoost 🚀 - ...,0.304183,0.472152,790,1452,0.974447
2,245,406,Sai Nikhilesh Kasturi,Madan Maram,https://towardsdatascience.com/lightgbm-vs-xgb...,https://medium.com/@madanmaram/xg-boost-for-be...,XGBOOST vs LightGBM: Which algorithm wins the ...,XG Boost For Begginers - Madan Maram - MediumO...,0.214356,0.374938,2003,1402,0.904303
3,79,296,Brian Ho,Jdsmith,https://medium.com/@brian.ho_44743/churn-model...,https://medium.com/@jdsmith1906/how-i-fell-in-...,Churn Modeling using Deep Convolution Neural N...,How I Fell in Love with XGBoost Algorithm - Jd...,0.111842,0.377709,323,750,0.79318
4,128,245,Megha Singhal,Sai Nikhilesh Kasturi,https://medium.com/analytics-vidhya/xgboost-th...,https://towardsdatascience.com/lightgbm-vs-xgb...,XGBoost : “Thousand forests is in one acorn” -...,XGBOOST vs LightGBM: Which algorithm wins the ...,0.115942,0.352861,734,2003,0.773579


In [14]:
# Print the untruncated links for the highest-ranked 'xgboost' pairs.
display_links(feature_matrix)

Potential plagiarism: 0
https://towardsdatascience.com/boosting-performance-with-xgboost-b4a8deadede7?source=search_post
https://medium.com/@knoldus/machinex-boosting-performance-with-xgboost-28c9f49998a6?source=search_post


Potential plagiarism: 1
https://medium.com/intel-student-ambassadors/building-scalable-tree-boosting-methods-tuning-of-parameters-2427adb8e958?source=search_post
https://towardsdatascience.com/boosting-your-way-to-the-top-with-xgboost-556fbe6b96d3?source=search_post


Potential plagiarism: 2
https://towardsdatascience.com/lightgbm-vs-xgboost-which-algorithm-win-the-race-1ff7dd4917d?source=search_post
https://medium.com/@madanmaram/xg-boost-for-begginers-9163ee2ed96?source=search_post


Potential plagiarism: 3
https://medium.com/@brian.ho_44743/churn-modeling-using-deep-convolution-neural-networks-b7b70805aa63?source=search_post
https://medium.com/@jdsmith1906/how-i-fell-in-love-with-xgboost-algorithm-aa903d4d3a75?source=search_post


Potential plagiarism: 4
https: