In [1]:
import numpy as np
import pandas as pd

# Data Preprocessing for `articles` file

In [2]:
articles = pd.read_json('raw_data/articles/articles_07_04_2021.json')

In [3]:
articles.sample(5)

Unnamed: 0,author,linkOfAuthorProfile,articleTitle,articleLink,postingTime,minToRead,recommendations,responses
6190,Mohan Gupta,https://towardsdatascience.com/@mohangupta13?s...,A Review of Named Entity Recognition (NER) Usi...,https://towardsdatascience.com/a-review-of-nam...,"Jul 9, 2018",11 min read,724,11 responses
31346,Charmaine Chui,https://towardsdatascience.com/@geek-cc?source...,Using Turf.js to Geocode coordinates with cust...,https://towardsdatascience.com/using-turf-js-t...,Jun 22,3 min read,1,
36150,German Osin,https://towardsdatascience.com/@gosin?source=c...,Feature Store as a Foundation for Machine Lear...,https://towardsdatascience.com/feature-store-a...,"Dec 10, 2020",12 min read,483,2 responses
8613,Federico Riveroll,https://towardsdatascience.com/@federicorivero...,Outstanding results predicting Apple Stock app...,https://towardsdatascience.com/making-a-contin...,"Feb 13, 2020",9 min read,1.5K,18 responses
20745,Shai Ardazi,https://towardsdatascience.com/@shaiardazi?sou...,Web scraping with Python — A to Z,https://towardsdatascience.com/web-scraping-wi...,"Feb 7, 2019",13 min read,298,3 responses


## Inspect missing columns

In [4]:
 # Define the function which checks missing data and types of data
def missing_data(data):
    """Summarise missing values and dtypes for every column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One column per column of ``data`` and three rows: ``Total`` (number of
        nulls), ``Percent`` (nulls as a percentage of the row count) and
        ``Types`` (dtype name as a string).
    """
    # Build the null mask once and reuse it for both the counts and the percentages.
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    null_share = null_counts / null_mask.count() * 100

    summary = pd.concat([null_counts, null_share], axis=1, keys=['Total', 'Percent'])
    summary['Types'] = [str(data[name].dtype) for name in data.columns]

    # Transpose so that each inspected column becomes a column of the report.
    return summary.T

In [5]:
missing_data(articles)

Unnamed: 0,author,linkOfAuthorProfile,articleTitle,articleLink,postingTime,minToRead,recommendations,responses
Total,0,0,1461,0,0,1,268,22235
Percent,0.0,0.0,3.307151,0.0,0.0,0.002264,0.606651,50.331621
Types,object,object,object,object,object,object,object,object


## There's only one entry that doesn't have `minToRead`. It turns out to be a navigation article. So I decided to drop it.

In [6]:
articles[articles['minToRead'].isnull()]

Unnamed: 0,author,linkOfAuthorProfile,articleTitle,articleLink,postingTime,minToRead,recommendations,responses
40490,TDS Editors,https://towardsdatascience.com/@towardsdatasci...,Navigation,https://towardsdatascience.com/navigation-1f82...,"Nov 14, 2020",,298,


In [7]:
articles = articles.dropna(subset=['minToRead'])

## Inspect `articleTitle` that has missing values

In [8]:
articles[articles['articleTitle'].isnull()].sample(5)

Unnamed: 0,author,linkOfAuthorProfile,articleTitle,articleLink,postingTime,minToRead,recommendations,responses
28876,Irfan Alghani Khalid,https://towardsdatascience.com/@irfanalghani11...,,https://towardsdatascience.com/this-is-how-i-w...,"May 12, 2020",4 min read,294,1 response
41025,Rose Day,https://towardsdatascience.com/@rjday?source=c...,,https://towardsdatascience.com/understanding-t...,"Nov 7, 2020",6 min read,48,
20910,Jo Stichbury,https://towardsdatascience.com/@fluffymaccoy?s...,,https://towardsdatascience.com/anzograph-a-w3c...,"Feb 8, 2019",7 min read,77,
24640,Oleksii Sheremet,https://towardsdatascience.com/@dynamic_phlox_...,,https://towardsdatascience.com/intersection-ov...,"Jul 24, 2020",3 min read,43,2 responses
40737,Sidney Kung,https://towardsdatascience.com/@sidneykung?sou...,,https://towardsdatascience.com/adapting-data-s...,"Nov 19, 2020",6 min read,192,


### I inspected the entry with index 14376 and decided to fill the null entries of the `articleTitle` column with words taken from `articleLink`

In [9]:
link = articles.loc[14376].articleLink
link

'https://towardsdatascience.com/data-science-powered-segmentation-models-ae89f9bd405f?source=collection_archive---------6-----------------------'

In [10]:
pt1 = link.split("?")[0]
pt1

'https://towardsdatascience.com/data-science-powered-segmentation-models-ae89f9bd405f'

In [11]:
pt2 = pt1.split("/")[-1]
pt2

'data-science-powered-segmentation-models-ae89f9bd405f'

In [12]:
pt3 = pt2.split("-")[:-1]
pt3

['data', 'science', 'powered', 'segmentation', 'models']

In [13]:
title = " ".join(pt3)
title

'data science powered segmentation models'

### Merge the above operations and fill null entries in `articleTitle`

In [14]:
def _title_from_link(url):
    """Build a fallback title from an article URL.

    Takes the last path segment before the query string, drops its final
    dash-separated token (the trailing id in the example link above) and joins
    the remaining tokens with spaces.
    """
    path = url.split("?")[0]
    slug = path.split("/")[-1]
    words = slug.split("-")[:-1]
    return " ".join(words)

# Only rows whose title is null receive the link-derived fallback.
articles['articleTitle'] = articles['articleTitle'].fillna(articles['articleLink'].apply(_title_from_link))

## Add a `user_id` column using the same technique as above (extract a segment from `linkOfAuthorProfile`)

In [15]:
# example link that contains `user_id`
link = articles.loc[10].linkOfAuthorProfile
link

'https://towardsdatascience.com/@databeast?source=collection_archive---------3-----------------------'

In [16]:
link.split('?')[0].split('@')[-1]

'databeast'

In [17]:
def _user_id_from_profile_link(url):
    """Return the text after the last '@' in the part of ``url`` before the query string."""
    without_query = url.split('?')[0]
    return without_query.split('@')[-1]

articles['user_id'] = articles['linkOfAuthorProfile'].apply(_user_id_from_profile_link)

## `postingTime` is formatted either like "Aug 25, 2018" (articles posted before 2021) or like "Jan 13" (articles posted in 2021, where the year is omitted).

In [18]:
# Convert this year's data format in "[Month] [day]" to "[Month] [day] [2021]"
def convert_date(x, year=2021):
    """Append a year to a date string that lacks one.

    Parameters
    ----------
    x : str
        Date text such as "Jan 13" (no year) or "Aug 25, 2018" (year present).
    year : int, optional
        Year to append when ``x`` contains no comma. Defaults to 2021, the
        year the data in this notebook was collected.

    Returns
    -------
    str
        ``x`` unchanged if it already contains a comma, otherwise
        "<x>, <year>". Non-string values (e.g. None/NaN) are returned as they
        are so that missing dates stay missing instead of raising.
    """
    # Missing values are not strings; pass them through untouched.
    if not isinstance(x, str):
        return x
    # A comma means the year is already part of the text.
    if ',' in x:
        return x
    return '{}, {}'.format(x, year)

articles['postingTime'] = articles['postingTime'].apply(convert_date)

# Convert the data format in "[Month] [day] [year]" to datetime format
articles['postingTime'] = pd.to_datetime(articles['postingTime'], format='%b %d, %Y')

## `recommendations` values are either below 1K (e.g. 221), at or above 1K (e.g. 1.3K), or null

In [19]:
# Fill the null entries in "recommendations" with "0".
# The result is assigned back to the column: calling fillna(inplace=True) on a
# selected column is chained assignment and may leave `articles` unchanged.
articles['recommendations'] = articles['recommendations'].fillna('0')

In [20]:
# Format "3.4K" to "3400" and also transform from string to integer
def convert_recommendations(x):
    """Convert a count such as "221", "1.3K" or "2M" to an integer.

    Parameters
    ----------
    x : str or int
        Count text, optionally ending in "K" (thousands) or "M" (millions),
        optionally containing thousands separators (","). Values that are
        already integers are accepted, so re-running the conversion cell on
        converted data does not fail.

    Returns
    -------
    int
        The numeric value.

    Raises
    ------
    ValueError
        If the text is empty or is not a number.
    """
    multipliers = {'K': 1000, 'M': 1000000}
    text = str(x).strip().replace(',', '')
    # Slicing (rather than indexing) keeps an empty string from raising IndexError.
    suffix = text[-1:].upper()
    if suffix in multipliers:
        # round() instead of truncation: a binary float product can land just
        # below the intended integer (e.g. 2.01 * 1000 == 2009.9999999999998).
        return int(round(float(text[:-1]) * multipliers[suffix]))
    return int(text)

articles['recommendations'] = articles['recommendations'].apply(convert_recommendations)

## The `responses` column has either null values or values formatted like "2 responses"

In [21]:
# Fill the null entries in `responses` with "0 response".
# The result is assigned back to the column: calling fillna(inplace=True) on a
# selected column is chained assignment and may leave `articles` unchanged.
articles['responses'] = articles['responses'].fillna('0 response')

In [22]:
# Keep the text before the first space (the count) and cast it to integer
response_counts = articles['responses'].str.split(' ').str[0]
articles['responses'] = response_counts.astype(int)

## `minToRead` column has format "3 min read"

In [23]:
# Keep the text before the first space (the minutes) and cast it to integer
minutes_text = articles['minToRead'].str.split(' ').str[0]
articles['minToRead'] = minutes_text.astype(int)

# Data Preprocessing for `users` file

In [24]:
profiles = pd.read_json('raw_data/users/users_07_04_2021.json')

In [25]:
len(profiles)

8000

## There are 11684 unique user ids in the `articles` file, but only 8000 user profiles were collected

In [26]:
len(set(articles.user_id))

11684

## Select unique `user_id` and corresponding `author`

In [27]:
users = articles[["user_id", "author", "linkOfAuthorProfile"]]

In [28]:
users = users.drop_duplicates(subset=["user_id"])

In [29]:
len(users)

11684

## Because some authors share the same name, merging on the name produces 64 extra (wrong) rows: 11748 rows instead of 11684

In [30]:
# Left join keeps every row of `users`; profiles are matched on the display name
# (`author` == `user_name`). Display names are not unique, so an author who shares
# a name with others is paired with every profile carrying that name, which is
# why the merged frame has more rows than `users`.
df = pd.merge(users, profiles, how="left", left_on="author", right_on="user_name")

In [31]:
df.sample(5)

Unnamed: 0,user_id,author,linkOfAuthorProfile,user_name,desc,followers
10719,srees1988,Sree,https://towardsdatascience.com/@srees1988?sour...,,,
5278,ivana-15022,Ivana Kotorchevikj,https://towardsdatascience.com/@ivana-15022?so...,,,
224,sethweidman,Seth Weidman,https://towardsdatascience.com/@sethweidman?so...,Seth Weidman,Became a data scientist to “use math to solve ...,992 Followers
7476,samdenlepcha,Samden Lepcha,https://towardsdatascience.com/@samdenlepcha?s...,,,
702,paulbradshaw,Paul Bradshaw,https://towardsdatascience.com/@paulbradshaw?s...,,,


In [32]:
len(df)

11748

## I inspected every duplicated `user_name` and deleted the wrong entries (only the first inspection is shown; the output of the other inspections is omitted)

In [33]:
duplicated = profiles[profiles.duplicated(subset=['user_name'])].user_name
duplicated

607       Gagandeep Singh
2337        Aditya Sharma
2944       Abhishek Kumar
3339      Harshdeep Singh
3888          Ofer Tirosh
4006               Gaurav
4015       Harshit Sharma
4320           Shen Huang
4343            An Nguyen
4726           Salil Jain
4766        Pranjal Gupta
4881        Shubham Gupta
4899         Bruno Santos
5573          Sahil Gupta
5932          Phoebe Wong
5989          Ravi Ranjan
6252                James
6562       Abhishek Kumar
7038        Nishant Sinha
7059         Vishal Singh
7329          Manu Sharma
7358      Shekhar Koirala
7411           Nick Jones
7421               Justin
7434           Wendy Wong
7551            Jason Lee
7578              Sue Liu
7725            Christina
7776        Mayank Mishra
7887    Benjamin Peterson
7949        Shikhar Gupta
Name: user_name, dtype: object

In [34]:
df[df.author=='Gagandeep Singh']

Unnamed: 0,user_id,author,linkOfAuthorProfile,user_name,desc,followers
3233,gaganmanku96,Gagandeep Singh,https://towardsdatascience.com/@gaganmanku96?s...,Gagandeep Singh,Data Scientist at Zykrr. Geeky —,578 Followers
3234,gaganmanku96,Gagandeep Singh,https://towardsdatascience.com/@gaganmanku96?s...,Gagandeep Singh,Big Data Engineer at WooliesX,74 Followers
4395,singh.gagandeep8,Gagandeep Singh,https://towardsdatascience.com/@singh.gagandee...,Gagandeep Singh,Data Scientist at Zykrr. Geeky —,578 Followers
4396,singh.gagandeep8,Gagandeep Singh,https://towardsdatascience.com/@singh.gagandee...,Gagandeep Singh,Big Data Engineer at WooliesX,74 Followers


In [35]:
df = df.drop(index=[3234, 4395])

In [36]:
# df[df.author=='Aditya Sharma']

In [37]:
df = df.drop(index=[2474, 8620])

In [38]:
# df[df.author=='Abhishek Kumar']

In [39]:
# Index labels to remove, each listed once and in ascending order
df = df.drop(index=[2827, 2828, 4363, 4364, 6811, 6813])

In [40]:
# df[df.author=='Harshdeep Singh']

In [41]:
df = df.drop(index=[1180, 3498])

In [42]:
# df[df.author=='Ofer Tirosh']

In [43]:
df = df.drop(index=[3507, 7540])

## During the inspections I also found that some profiles' descriptions weren't collected, so I filled them in manually

In [44]:
# Write through a single .loc[row, column] call: assigning to `df.loc[row].desc`
# sets an attribute on a temporary row Series and may never reach `df`.
df.loc[3506, 'desc'] = "CEO and Founder of Tomedes, a professional services provider to Fortune 500 companies around the world specializing in localization and translation."

In [45]:
# df[df.author=='Gaurav']

In [46]:
df = df.drop(index=[256, 3993])

In [47]:
# Write through a single .loc[row, column] call: assigning to `df.loc[row].desc`
# sets an attribute on a temporary row Series and may never reach `df`.
df.loc[257, 'desc'] = "Editor of TapTechie Publication and Tech@Breno"

In [48]:
# df[df.author=='Harshit Sharma']

In [49]:
df = df.drop(index=[384, 830])

In [50]:
# df[df.author=='Shen Huang']

In [51]:
df = df.drop(index=[3779, 5970])

In [52]:
# df[df.author=='An Nguyen']

In [53]:
df = df.drop(index=[955, 6053])

In [54]:
# df[df.author=='Salil Jain']

In [55]:
df = df.drop(index=[328, 7938])

In [56]:
# df[df.author=='Pranjal Gupta']

In [57]:
df = df.drop(index=[6699, 8601])

In [58]:
# df[df.author=='Shubham Gupta']

In [59]:
df = df.drop(index=[8120, 8820])

In [60]:
# df[df.author=='Bruno Santos']

In [61]:
df = df.drop(index=[2628, 4611])

In [62]:
# df[df.author=='Sahil Gupta']

In [63]:
df = df.drop(index=[3909, 8190])

In [64]:
# df[df.author=='Phoebe Wong']

In [65]:
df = df.drop(index=[4299, 5801])

In [66]:
# df[df.author=='Ravi Ranjan']

In [67]:
df = df.drop(index=[938, 3237])

In [68]:
# df[df.author=='James']

In [69]:
df = df.drop(index=[331, 8613])

In [70]:
# df[df.author=='Nishant Sinha']

In [71]:
df = df.drop(index=[213, 2663])

In [72]:
# df[df.author=='Vishal Singh']

In [73]:
df = df.drop(index=[1002, 1623])

In [74]:
# Write through a single .loc[row, column] call: assigning to `df.loc[row].desc`
# sets an attribute on a temporary row Series and may never reach `df`.
df.loc[1001, 'desc'] = 'Medium member since August 2020'

In [75]:
# df[df.author=='Manu Sharma']

In [76]:
df = df.drop(index=[4068, 5522])

In [77]:
# df[df.author=='Shekhar Koirala']

In [78]:
df = df.drop(index=[894, 1652])

In [79]:
# df[df.author=='Nick Jones']

In [80]:
df = df.drop(index=[1054, 2063])

In [81]:
# df[df.author=='Justin']

In [82]:
df = df.drop(index=[2371, 5415])

In [83]:
# Write through a single .loc[row, column] call: assigning to `df.loc[row].desc`
# sets an attribute on a temporary row Series and may never reach `df`.
df.loc[2372, 'desc'] = "Hello, world! My name is Justin. I solve problems using data. Check me out at embracingtherandom.com and linkedin.com/in/justin-m-evans/"

In [84]:
# df[df.author=='Wendy Wong']

In [85]:
df = df.drop(index=[1519, 4695])

In [86]:
# df[df.author=='Jason Lee']

In [87]:
df = df.drop(index=[3501, 8042])

In [88]:
# df[df.author=='Sue Liu']

In [89]:
df = df.drop(index=[1537, 7131])

In [90]:
# df[df.author=='Christina']

In [91]:
df = df.drop(index=[932, 8556])

In [92]:
# df[df.author=='Mayank Mishra']

In [93]:
df = df.drop(index=[7293, 7555])

In [94]:
# df[df.author=='Benjamin Peterson']

In [95]:
df = df.drop(index=[3511, 6446])

In [96]:
# df[df.author=='Shikhar Gupta']

In [97]:
df = df.drop(index=[47, 6058])

## Now there are no duplicated wrong entries!!! I also dropped the duplicated column `user_name`

In [98]:
df[df.duplicated(subset=["user_id"])]

Unnamed: 0,user_id,author,linkOfAuthorProfile,user_name,desc,followers


In [99]:
len(df)

11684

In [100]:
df = df.drop(columns="user_name")

## The `followers` column has either null values or values formatted like "552 Followers"

In [101]:
# Change the null entries to '0 follower'.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# selected column is chained assignment and may leave `df` unchanged.
df['followers'] = df['followers'].fillna('0 follower')

In [102]:
# Transform the format from "[num] follower(s)" to "num" in integer
df['followers'] = df['followers'].str.split(' ').str[0]

In [103]:
# Format "3.4K" to "3400" and convert string to integer
def convert_followers(x):
    """Convert a follower count such as "992", "3.4K" or "1.2M" to an integer.

    Parameters
    ----------
    x : str or int
        Count text, optionally ending in "K" (thousands) or "M" (millions),
        optionally containing thousands separators (","). Values that are
        already integers are accepted, so re-running the conversion cell on
        converted data does not fail.

    Returns
    -------
    int
        The numeric value.

    Raises
    ------
    ValueError
        If the text is empty or is not a number.
    """
    multipliers = {'K': 1000, 'M': 1000000}
    text = str(x).strip().replace(',', '')
    # Slicing (rather than indexing) keeps an empty string from raising IndexError.
    suffix = text[-1:].upper()
    if suffix in multipliers:
        # round() instead of truncation: a binary float product can land just
        # below the intended integer (e.g. 2.01 * 1000 == 2009.9999999999998).
        return int(round(float(text[:-1]) * multipliers[suffix]))
    return int(text)

df['followers'] = df['followers'].apply(convert_followers)

In [104]:
df.sample(5)

Unnamed: 0,user_id,author,linkOfAuthorProfile,desc,followers
9868,jeffrey-scholz,Jeffrey Scholz,https://towardsdatascience.com/@jeffrey-scholz...,,0
6702,dlite,Derek Haynes,https://towardsdatascience.com/@dlite?source=c...,Working on,336
1253,leofle,Lio Fleishman,https://towardsdatascience.com/@leofle?source=...,"Partnership Solutions Engineer at Sisense , th...",22
3366,tolaniadekoya,Tolani Adekoya,https://towardsdatascience.com/@tolaniadekoya?...,,0
11435,colefp,Cole,https://towardsdatascience.com/@colefp?source=...,,0


# Export the cleaned data to csv files

In [105]:
df.to_csv("cleaned_data/users/users_07_04_2021.csv", index=False)

In [106]:
articles = articles.drop(columns=["author", "linkOfAuthorProfile"])
articles.to_csv("cleaned_data/articles/articles_07_04_2021.csv", index=False)