In [36]:
import os
import pandas as pd
import numpy as np
from IPython.display import HTML

In [47]:
def sbs(*title_df_tuples):
    html = '<div style="display:flex">'
    for title,df in title_df_tuples:
        html += f'<div style="display:block"><h3>{title}</h3>'
        html += '<div style="margin-right: 5em">'
        html += df.to_html()
        html += '</div></div>'
    html += '</div>'
    display(HTML(html))

In [21]:
'''
0 -> normal medium article
    normal stuff
1 -> meta post eg. follow4follow OR earnings
    eg. how much I earned from Medium in the month of X
    eg. Let's help one another get to 100 followers!
'''
raw = []

for folder in ['normal', 'follow4follow', 'earnings']:
    folder_path = f'data/{folder}/'
    for filename in os.listdir(folder_path):
        filepath = folder_path + filename
        with open(filepath, 'r') as f:
            raw.append((f.read(), 0 if folder=='normal' else 1))

raw = pd.DataFrame(raw, columns=['article', 'target'])
raw

Unnamed: 0,article,target
0,Unpopular opinion.\n\nvia Photo by Christina M...,0
1,I Met a CEO in Business Class and Asked Him Ab...,0
2,How to Turn Journal Ramblings into Viral Artic...,0
3,18 Ways to Repurpose Content\nRepurposing is N...,0
4,The Importance of Code Ownership\nWell-owned c...,0
...,...,...
95,"My First Three Months On Medium Earned $2,600+...",1
96,My Medium Earnings Jumped Another $200 in One ...,1
97,How I Increased my Medium Earnings from $180 t...,1
98,My First Month On Medium — Earnings Reveal\nI ...,1


In [6]:
import re

def clean(article):
    article = re.sub(r'[ \n\t]+', ' ', article)
    return article.lower()

data = raw.copy()
data['article'] = [clean(a) for a in data['article']]

data

Unnamed: 0,article,target
0,unpopular opinion. via photo by christina mori...,0
1,i met a ceo in business class and asked him ab...,0
2,how to turn journal ramblings into viral artic...,0
3,18 ways to repurpose content repurposing is no...,0
4,the importance of code ownership well-owned co...,0
...,...,...
95,"my first three months on medium earned $2,600+...",1
96,my medium earnings jumped another $200 in one ...,1
97,how i increased my medium earnings from $180 t...,1
98,my first month on medium — earnings reveal i c...,1


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [32]:
x = data['article']
y = data['target']

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

tv = TfidfVectorizer(ngram_range=(1,2), stop_words='english')

x_train_vec = tv.fit_transform(x_train)
x_test_vec = tv.transform(x_test)

x_train_vec.shape, x_test_vec.shape

(75,) (25,) (75,) (25,)


((75, 35020), (25, 35020))

# Logistic Regression

In [33]:
lr_model = LogisticRegression()
lr_model.fit(x_train_vec, y_train)
y_pred = lr_model.predict(x_test_vec)

print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96        12
           1       1.00      0.92      0.96        13

    accuracy                           0.96        25
   macro avg       0.96      0.96      0.96        25
weighted avg       0.96      0.96      0.96        25



In [69]:
'''
Finding top N important words/bigrams/trigrams
'''
N = 20

vocab = {v:k for k,v in tv.vocabulary_.items()}
coefs = lr_model.coef_[0]
indexes = list(np.argsort(coefs))

bottom = pd.DataFrame([vocab[i] for i in indexes[:N]], columns=['words'])
bottom['coef'] = [coefs[i] for i in indexes[:N]]

top = pd.DataFrame([vocab[i] for i in indexes[-N:]], columns=['words'])
top['coef'] = [coefs[i] for i in indexes[-N:]]

sbs(('Normal articles', bottom), ('Follow4follow + earnings', top.sort_values(by='coef', ascending=False)))

Unnamed: 0,words,coef
0,code,-0.614002
1,google,-0.427375
2,team,-0.281406
3,data,-0.230436
4,tyrion,-0.219153
5,men,-0.202382
6,business,-0.200622
7,life,-0.195037
8,employees,-0.192113
9,job,-0.187742

Unnamed: 0,words,coef
19,medium,1.175608
18,followers,0.977908
17,100,0.724673
16,100 followers,0.713271
15,follow,0.646174
14,earnings,0.641769
13,articles,0.553276
12,writing,0.497428
11,month,0.472124
10,stories,0.451135


# Random Forest Classifier

In [59]:
rf_model = RandomForestClassifier()
rf_model.fit(x_train_vec, y_train)
y_pred = rf_model.predict(x_test_vec)

print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      1.00      1.00        12

    accuracy                           1.00        25
   macro avg       1.00      1.00      1.00        25
weighted avg       1.00      1.00      1.00        25



In [76]:
N = 20
importances = rf_model.feature_importances_
indexes = list(np.argsort(importances))

vocab = {v:k for k,v in tv.vocabulary_.items()}
report = pd.DataFrame([vocab[i] for i in indexes[-N:]], columns=['words'])
report['importance'] = [importances[i] for i in indexes[-N:]]
report = report.sort_values(by='importance', ascending=False)

report

Unnamed: 0,words,importance
19,medium,0.026896
18,writing,0.021902
17,followers,0.014349
16,earnings,0.013863
15,article,0.011477
14,share,0.01046
13,time,0.009518
12,read,0.008323
11,writers,0.007927
10,earning,0.00785


# Testing the models with self-written data

In [89]:
test = [
    "Let's help each other! You follow me and I\'ll follow you!",
    "This is how much I've earned from Medium in the past 2 months",
    "Leave a comment down below asking others to follow you! Then pay if forward and follow them back",
    "This is how I doubled my Medium earnings in simply 2 months!",
    "Help me reach 100 followers and I'll help you reach 100 followers!",
    
    'who lives in a pineapple under the sea?',
    'How much wood can a wood chuck chuck if a wood chuck could chuck wood',
    'She sells seashells on the seashore',
    'apple orange pear pineapple durian',
]

test_labels = [1,1,1,1,1,0,0,0,0]

test_vec = tv.transform([clean(a) for a in test])

report = pd.DataFrame(test, columns=['text'])
report['prediction'] = lr_model.predict(test_vec)
report['actual'] = test_labels

HTML(report.to_html())

Unnamed: 0,text,prediction,actual
0,Let's help each other! You follow me and I'll follow you!,1,1
1,This is how much I've earned from Medium in the past 2 months,1,1
2,Leave a comment down below asking others to follow you! Then pay if forward and follow them back,1,1
3,This is how I doubled my Medium earnings in simply 2 months!,1,1
4,Help me reach 100 followers and I'll help you reach 100 followers!,1,1
5,who lives in a pineapple under the sea?,0,0
6,How much wood can a wood chuck chuck if a wood chuck could chuck wood,0,0
7,She sells seashells on the seashore,0,0
8,apple orange pear pineapple durian,0,0
