In [1]:
import os
import warnings
warnings.filterwarnings("ignore")

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/medium-articles/medium_articles.csv


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder


# load dataset

In [3]:
data_dir = '/kaggle/input/medium-articles'
data_file = 'medium_articles.csv'
to_file = os.path.join(data_dir, data_file)
df = pd.read_csv(to_file)

# **understanding data**

In [4]:
df.head(10)

Unnamed: 0,title,text,url,authors,timestamp,tags
0,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,https://medium.com/invisible-illness/mental-no...,['Ryan Fan'],2020-12-26 03:38:10.479000+00:00,"['Mental Health', 'Health', 'Psychology', 'Sci..."
1,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...,https://medium.com/age-of-awareness/how-the-pa...,['Simon Spichak'],2020-09-23 22:10:17.126000+00:00,"['Mental Health', 'Coronavirus', 'Science', 'P..."
2,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...,https://medium.com/neodotlife/mind-your-nose-f...,[],2020-10-10 20:17:37.132000+00:00,"['Biotechnology', 'Neuroscience', 'Brain', 'We..."
3,The 4 Purposes of Dreams,Passionate about the synergy between science a...,https://medium.com/science-for-real/the-4-purp...,['Eshan Samaranayake'],2020-12-21 16:05:19.524000+00:00,"['Health', 'Neuroscience', 'Mental Health', 'P..."
4,Surviving a Rod Through the Head,"You’ve heard of him, haven’t you? Phineas Gage...",https://medium.com/live-your-life-on-purpose/s...,['Rishav Sinha'],2020-02-26 00:01:01.576000+00:00,"['Brain', 'Health', 'Development', 'Psychology..."
5,"Mentally, Young Adults Are Suffering Most From...","Mentally, Young Adults Are Suffering Most From...",https://medium.com/the-partnered-pen/mentally-...,['Ryan Fan'],2020-11-19 15:27:36.001000+00:00,"['Society', 'Mental Health', 'Health', 'Nonfic..."
6,How to Turn Your Popular Blog Series Into a Be...,How to Turn Your Popular Blog Series Into a Be...,https://frankmckinley.medium.com/want-to-turn-...,['Frank Mckinley'],2020-01-28 03:36:58.566000+00:00,"['Books', 'Entrepreneurship', 'Writing', 'Mark..."
7,Dr Faisal Dar — Pioneer of Liver Transplantati...,Dr Faisal Dar — Pioneer of Liver Transplantati...,https://medium.com/storyfest/dr-faisal-dar-pio...,['Fatima Arif'],2019-02-28 06:01:04.914000+00:00,"['People', 'Storyfest', 'Health', 'Pakistan', ..."
8,Sunlight — The Natural Supplement For Our Ment...,Sunlight — The Natural Supplement For Our Ment...,https://medium.com/wholistique/sunlight-the-na...,['Jerren Gan'],2020-12-16 10:47:37.171000+00:00,"['Self Improvement', 'Mental Health', 'Health'..."
9,Occam’s dice,Occam’s dice\n\nDistrusting biological metapho...,https://medium.com/the-spike/occams-dice-258aa...,['Kelly Clancy'],2018-08-22 14:55:56.262000+00:00,"['Machine Learning', 'Science', 'Neuroscience'..."


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192368 entries, 0 to 192367
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   title      192363 non-null  object
 1   text       192368 non-null  object
 2   url        192368 non-null  object
 3   authors    192368 non-null  object
 4   timestamp  192366 non-null  object
 5   tags       192368 non-null  object
dtypes: object(6)
memory usage: 8.8+ MB


In [6]:
df.drop(columns = ["url", "authors", "timestamp"], inplace = True)

In [7]:
df.drop_duplicates(keep = "first", inplace = True)
df.reset_index(inplace = True, drop = True)

In [8]:
df = df.head(5000)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   5000 non-null   object
 1   text    5000 non-null   object
 2   tags    5000 non-null   object
dtypes: object(3)
memory usage: 117.3+ KB


# handeling missing data

In [10]:
max_non_null_val = df["text"].count()
print("Percentage of missing values in each column:\n")
missing_percent = []
for col in df.columns:
  missing_percent.append(df[col].isnull().sum()/max_non_null_val*100)
  print(f"{col} : {df[col].isnull().sum()/max_non_null_val*100}%")

Percentage of missing values in each column:

title : 0.0%
text : 0.0%
tags : 0.0%


In [11]:
df.dropna(inplace = True)
df.reset_index(drop=True, inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   5000 non-null   object
 1   text    5000 non-null   object
 2   tags    5000 non-null   object
dtypes: object(3)
memory usage: 117.3+ KB


#Loading the Spacy model


In [12]:
!pip install spacy
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [13]:
import spacy
nlp = spacy.load("en_core_web_md")

##preprocessing the data

In [14]:
df["text"] = df["title"] + " " + df["text"] + " " + df["tags"]

In [15]:
def preprocess_text(text):
      doc = nlp(text)
      tokens = [token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha]
      review = ' '.join(tokens)
      return review


In [16]:
corpus = df['text'].apply(preprocess_text)

# Creating bag of words model

In [49]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X_cv = cv.fit_transform(corpus).toarray()

##Making an cosine similarity matrix

In [50]:
from sklearn.metrics.pairwise import cosine_similarity
def get_similar_article(text, vectorizer, X, df, top_n=5):
    preprocessed_text = preprocess_text(text)
    X_text = vectorizer.transform([preprocessed_text])
    similarity_scores = cosine_similarity(X, X_text).flatten()
    top_indices = similarity_scores.argsort()[-top_n:][::-1]
    return df.iloc[top_indices], similarity_scores[top_indices], similarity_scores

In [51]:
similar_articles, scores, similarity_scores = get_similar_article(df["text"][0],cv, X_cv, df)
print(scores)
similar_articles

[1.         0.42523619 0.42173792 0.38316829 0.37724186]


Unnamed: 0,title,text,tags,cluster_tfidf,cluster_cv
0,Mental Note Vol. 24,Mental Note Vol. 24 Photo by Josh Riemer on Un...,"['Mental Health', 'Health', 'Psychology', 'Sci...",0,5
496,Let go of these things for a happier 2021,Let go of these things for a happier 2021 by: ...,"['Nonfiction', 'Psychology', 'Relationships', ...",0,6
4221,8 Life Lessons I’ve Learned at 40-Something Th...,8 Life Lessons I’ve Learned at 40-Something Th...,"['Life Lessons', 'Self Love', 'Aging', 'Self I...",0,6
220,Why Do I Feel Guilty When My Mental Health is ...,Why Do I Feel Guilty When My Mental Health is ...,"['Creativity', 'Mindfulness', 'Mental Health',...",6,6
2294,Are You Living Your Life with the End Result i...,Are You Living Your Life with the End Result i...,"['Motivation', 'Self', 'Psychology', 'Life Les...",0,6


# Creating TF-IDF model

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer()
X_tfidf = tv.fit_transform(corpus).toarray()

In [53]:
similar_articles, scores, similarity_scores = get_similar_article(df["text"][0], tv, X_tfidf, df)
print(scores)
similar_articles

[1.         0.20868122 0.20314881 0.19685895 0.19384693]


Unnamed: 0,title,text,tags,cluster_tfidf,cluster_cv
0,Mental Note Vol. 24,Mental Note Vol. 24 Photo by Josh Riemer on Un...,"['Mental Health', 'Health', 'Psychology', 'Sci...",0,5
2289,Don’t Try to be Anyone’s Rock if You’re a Pile...,Don’t Try to be Anyone’s Rock if You’re a Pile...,"['Mindfulness', 'Mental Health', 'Self Love', ...",0,5
2627,Asshole Astrology: Week of 21 December 2020,Asshole Astrology: Week of 21 December 2020 As...,"['Relationships', 'Self', 'Psychology', 'Life ...",1,6
4629,Christmas,Christmas Christmas\n\nWish\n\nPhoto by loly g...,"['Christmas', 'Nonfiction', 'Family', 'Spiritu...",1,5
1982,Bounce! A Therapist’s New Year Resolution for ...,Bounce! A Therapist’s New Year Resolution for ...,"['Resilience', 'Mental Health', 'Stress', 'Cor...",0,6


# Creating word vectorizer model

In [54]:
X=[]
for i in corpus:
  X.append(nlp(i).vector)


In [55]:
def get_similar_article_using_wordvector(text, nlp, X, df, top_n=5):
    preprocessed_text = preprocess_text(text)
    X_text = [nlp(preprocessed_text).vector]
    similarity_scores = cosine_similarity(X, X_text).flatten()
    top_indices = similarity_scores.argsort()[-top_n:][::-1]
    return df.iloc[top_indices], similarity_scores[top_indices], similarity_scores


In [56]:
similar_articles, scores, similarity_scores = get_similar_article_using_wordvector(df["text"][0], nlp, X, df)
print(scores)
similar_articles

[0.99999994 0.98310107 0.9809567  0.97989994 0.97987354]


Unnamed: 0,title,text,tags,cluster_tfidf,cluster_cv
0,Mental Note Vol. 24,Mental Note Vol. 24 Photo by Josh Riemer on Un...,"['Mental Health', 'Health', 'Psychology', 'Sci...",0,5
474,Our Brains Struggle to Process This Much Stress,Our Brains Struggle to Process This Much Stres...,"['Health', 'Depression', 'Mental Health', 'Psy...",0,6
4332,Experts at the End of the World,Experts at the End of the World Photo by J W o...,"['Covid 19', 'Mental Health', 'PTSD', 'Body', ...",0,6
3178,When Mental Illness Becomes a Friend,When Mental Illness Becomes a Friend I have re...,"['Relationships', 'Mental Health', 'Mental Ill...",0,6
2421,The 6-Week Void in My Identity,The 6-Week Void in My Identity Six Weeks\n\nTh...,"['Trauma', 'Mental Health', 'Self', 'Psycholog...",0,6


# Making a k-means clustering algorithm

In [64]:
from sklearn.cluster import KMeans

num_clusters = 10 
kmeans_cv = KMeans(n_clusters=num_clusters, random_state=42)
df['cluster_cv'] = kmeans_cv.fit_predict(X_cv)

kmeans_tfidf = KMeans(n_clusters=num_clusters, random_state=42)
df['cluster_tfidf'] = kmeans_tfidf.fit_predict(X_tfidf)

In [65]:
df["cluster_tfidf"]

0       0
1       1
2       1
3       1
4       1
       ..
4995    0
4996    0
4997    2
4998    8
4999    9
Name: cluster_tfidf, Length: 5000, dtype: int32

In [66]:
df["cluster_cv"]

0       5
1       5
2       5
3       5
4       5
       ..
4995    5
4996    6
4997    0
4998    0
4999    5
Name: cluster_cv, Length: 5000, dtype: int32

In [67]:
def get_similar_articles_kmeans_cv(text, vectorizer, kmeans, df, top_n=5):
    preprocessed_text = preprocess_text(text)
    X_text = vectorizer.transform([preprocessed_text])
    cluster_label = kmeans.predict(X_text)[0]
    cluster_indices = df[df['cluster_cv'] == cluster_label].index
    similar_articles = df.iloc[cluster_indices].head(top_n)
    
    return similar_articles

In [69]:
similar_articles_kmeans = get_similar_articles_kmeans(df["text"][0], cv, kmeans, df)
similar_articles_kmeans

Unnamed: 0,title,text,tags,cluster_tfidf,cluster_cv
0,Mental Note Vol. 24,Mental Note Vol. 24 Photo by Josh Riemer on Un...,"['Mental Health', 'Health', 'Psychology', 'Sci...",0,5
1,Your Brain On Coronavirus,Your Brain On Coronavirus Your Brain On Corona...,"['Mental Health', 'Coronavirus', 'Science', 'P...",1,5
2,Mind Your Nose,Mind Your Nose Mind Your Nose\n\nHow smell tra...,"['Biotechnology', 'Neuroscience', 'Brain', 'We...",1,5
3,The 4 Purposes of Dreams,The 4 Purposes of Dreams Passionate about the ...,"['Health', 'Neuroscience', 'Mental Health', 'P...",1,5
4,Surviving a Rod Through the Head,Surviving a Rod Through the Head You’ve heard ...,"['Brain', 'Health', 'Development', 'Psychology...",1,5


In [70]:
def get_similar_articles_kmeans_tv(text, vectorizer, kmeans, df, top_n=5):
    preprocessed_text = preprocess_text(text)
    X_text = vectorizer.transform([preprocessed_text])
    cluster_label = kmeans.predict(X_text)[0]
    cluster_indices = df[df['cluster_tfidf'] == cluster_label].index
    similar_articles = df.iloc[cluster_indices].head(top_n)
    
    return similar_articles

In [71]:
similar_articles_kmeans = get_similar_articles_kmeans(df["text"][0], tv, kmeans, df)
similar_articles_kmeans

Unnamed: 0,title,text,tags,cluster_tfidf,cluster_cv
0,Mental Note Vol. 24,Mental Note Vol. 24 Photo by Josh Riemer on Un...,"['Mental Health', 'Health', 'Psychology', 'Sci...",0,5
1,Your Brain On Coronavirus,Your Brain On Coronavirus Your Brain On Corona...,"['Mental Health', 'Coronavirus', 'Science', 'P...",1,5
2,Mind Your Nose,Mind Your Nose Mind Your Nose\n\nHow smell tra...,"['Biotechnology', 'Neuroscience', 'Brain', 'We...",1,5
3,The 4 Purposes of Dreams,The 4 Purposes of Dreams Passionate about the ...,"['Health', 'Neuroscience', 'Mental Health', 'P...",1,5
4,Surviving a Rod Through the Head,Surviving a Rod Through the Head You’ve heard ...,"['Brain', 'Health', 'Development', 'Psychology...",1,5


In [62]:
def get_similar_article_using_kmeans_cv_and_cosine(text, vectorizer, kmeans, X, df, top_n=5):
    preprocessed_text = preprocess_text(text)
    X_text = vectorizer.transform([preprocessed_text])  
    cluster_label = kmeans.predict(X_text)[0]  
    cluster_indices = df[df['cluster_cv'] == cluster_label].index 
    
    X_cluster = vectorizer.transform(df.loc[cluster_indices, 'text'])  
    
    similarity_scores = cosine_similarity(X_cluster, X_text).flatten() 
    top_n = min(top_n, len(similarity_scores))  
    top_cluster_indices = cluster_indices[similarity_scores.argsort()[-top_n:][::-1]] 
    
    return df.iloc[top_cluster_indices], similarity_scores[similarity_scores.argsort()[-top_n:][::-1]], similarity_scores


In [63]:
similar_articles, scores, similarity_scores = get_similar_article_using_kmeans_and_cosine(df["text"][0], cv, kmeans, X, df)
print(scores)
similar_articles

[0.34151786 0.23538329 0.23412939 0.22197103 0.20203051]


Unnamed: 0,title,text,tags,cluster_tfidf,cluster_cv
0,Mental Note Vol. 24,Mental Note Vol. 24 Photo by Josh Riemer on Un...,"['Mental Health', 'Health', 'Psychology', 'Sci...",0,5
4118,Why Dropping Out of School Will Make Your Life...,Why Dropping Out of School Will Make Your Life...,"['Life Lessons', 'Self Improvement', 'Writing'...",0,5
4926,Improving Your Mental Health Begins By Talking...,Improving Your Mental Health Begins By Talking...,"['Relationships', 'Life Lessons', 'Mental Heal...",0,5
1058,"Have a Happy, Healthy Holiday!","Have a Happy, Healthy Holiday! Have a Happy, H...","['Holidays', 'Health', 'Mental Health', 'Chris...",1,5
1946,Dieting Addiction,Dieting Addiction Dieting Addiction\n\nA poem ...,"['Poetry', 'Life', 'Mental Health', 'Self Impr...",0,5


In [72]:
def get_similar_article_using_kmeans_tv_and_cosine(text, vectorizer, kmeans, X, df, top_n=5):
    preprocessed_text = preprocess_text(text)
    X_text = vectorizer.transform([preprocessed_text])  
    cluster_label = kmeans.predict(X_text)[0]  
    cluster_indices = df[df['cluster_tfidf'] == cluster_label].index 
    
    X_cluster = vectorizer.transform(df.loc[cluster_indices, 'text'])  
    
    similarity_scores = cosine_similarity(X_cluster, X_text).flatten() 
    top_n = min(top_n, len(similarity_scores))  
    top_cluster_indices = cluster_indices[similarity_scores.argsort()[-top_n:][::-1]] 
    
    return df.iloc[top_cluster_indices], similarity_scores[similarity_scores.argsort()[-top_n:][::-1]], similarity_scores


In [73]:
similar_articles, scores, similarity_scores = get_similar_article_using_kmeans_and_cosine(df["text"][0], tv, kmeans, X, df)
print(scores)
similar_articles

[0.20864714 0.12961765 0.12189375 0.08365108 0.08068601]


Unnamed: 0,title,text,tags,cluster_tfidf,cluster_cv
0,Mental Note Vol. 24,Mental Note Vol. 24 Photo by Josh Riemer on Un...,"['Mental Health', 'Health', 'Psychology', 'Sci...",0,5
1058,"Have a Happy, Healthy Holiday!","Have a Happy, Healthy Holiday! Have a Happy, H...","['Holidays', 'Health', 'Mental Health', 'Chris...",1,5
4629,Christmas,Christmas Christmas\n\nWish\n\nPhoto by loly g...,"['Christmas', 'Nonfiction', 'Family', 'Spiritu...",1,5
1796,The One Good Thing About The Worst Christmas S...,The One Good Thing About The Worst Christmas S...,"['Pop Culture', 'International', 'Christmas', ...",1,5
2463,Personalised Christmas Cards With Unique Promo...,Personalised Christmas Cards With Unique Promo...,"['Christmas', 'Sales', 'Marketing', 'Ecommerce...",1,5
