In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

df = pd.read_csv("../input/medium-2021-data-science-articles-dataset/medium-data-science-articles-2021.csv")

df.info()

/kaggle/input/medium-2021-data-science-articles-dataset/medium-data-science-articles-2021.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47660 entries, 0 to 47659
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   url           47660 non-null  object 
 1   title         47660 non-null  object 
 2   author        47660 non-null  object 
 3   author_page   47660 non-null  object 
 4   subtitle      13728 non-null  object 
 5   claps         47660 non-null  float64
 6   responses     47660 non-null  float64
 7   reading_time  47660 non-null  float64
 8   tag           47660 non-null  object 
 9   date          47660 non-null  object 
dtypes: float64(3), object(7)
memory usage: 3.6+ MB


In [2]:
df.shape

(47660, 10)

In [3]:
df.isna().sum()

url                 0
title               0
author              0
author_page         0
subtitle        33932
claps               0
responses           0
reading_time        0
tag                 0
date                0
dtype: int64

* The subtitle variable has 13728 values.
* All of the values in the DataFrame are strings, except for the claps, responses, reading_time variables.
* There are 47,660 articles present in the DataFrame.

## Analysis Plan ##

* Based on the data, I'm interested in performing sentiment analysis on the `title` variable, and statistical analysis on how sentiment varies between tags.


## NLP model ##

To accomplish this, I plan to include the following NLP tasks:

* Text preprocessing using regex and NLTK.
* Exploratory data analysis with pandas and seaborn.
* Sentiment analysis using bag-of-words and a Naive Bayes classifier.

## Text preprocessing tasks
* Noise Removal
    * Empty articles
    * Punctuation
    * Stopwords
    * URLs
    * HTML Tags

* Text Normalization 
    * Lower Casing

### Lower case title variable

In [4]:
# Sets all characters to lower case in Title
df["title_clean"] = df["title"].str.lower()
# View changes
df.head()

Unnamed: 0,url,title,author,author_page,subtitle,claps,responses,reading_time,tag,date,title_clean
0,https://medium.com/@dharmeshpanchmatia/data-an...,Data Analytics and AI/ML platform for eCommerce,dharmeshpanchmatia,https://medium.com/@dharmeshpanchmatia,Improve user pr,30.0,0.0,5.0,Big Data,2021-01-01,data analytics and ai/ml platform for ecommerce
1,https://medium.com/predict/the-journey-from-an...,The journey from an Apple Silicon M1 powered l...,dimitrisv,https://medium.com/@dimitrisv,,42.0,0.0,5.0,Big Data,2021-01-01,the journey from an apple silicon m1 powered l...
2,https://medium.com/@rajsaraogi/demystifying-kp...,Demystifying KPIs in Analytics,rajsaraogi,https://medium.com/@rajsaraogi,,19.0,0.0,3.0,Big Data,2021-01-01,demystifying kpis in analytics
3,https://medium.com/@bigdataschool/%D0%B1%D0%BE...,Большая разница: чем структурированная потоков...,bigdataschool,https://medium.com/@bigdataschool,,0.0,0.0,6.0,Big Data,2021-01-01,большая разница: чем структурированная потоков...
4,https://medium.com/@arnowa-44509/iot-a-new-way...,IoT: A New Way of Life,arnowa-44509,https://medium.com/@arnowa-44509,"IoT is more than just the Internet of things, ...",0.0,0.0,3.0,Big Data,2021-01-01,iot: a new way of life


### Remove punctuation

In [5]:
# Remove punctiation from Message variable
df["title_clean"] = df["title_clean"].str.replace('[^\w\s]','')
# View changes
df.head()

  


Unnamed: 0,url,title,author,author_page,subtitle,claps,responses,reading_time,tag,date,title_clean
0,https://medium.com/@dharmeshpanchmatia/data-an...,Data Analytics and AI/ML platform for eCommerce,dharmeshpanchmatia,https://medium.com/@dharmeshpanchmatia,Improve user pr,30.0,0.0,5.0,Big Data,2021-01-01,data analytics and aiml platform for ecommerce
1,https://medium.com/predict/the-journey-from-an...,The journey from an Apple Silicon M1 powered l...,dimitrisv,https://medium.com/@dimitrisv,,42.0,0.0,5.0,Big Data,2021-01-01,the journey from an apple silicon m1 powered l...
2,https://medium.com/@rajsaraogi/demystifying-kp...,Demystifying KPIs in Analytics,rajsaraogi,https://medium.com/@rajsaraogi,,19.0,0.0,3.0,Big Data,2021-01-01,demystifying kpis in analytics
3,https://medium.com/@bigdataschool/%D0%B1%D0%BE...,Большая разница: чем структурированная потоков...,bigdataschool,https://medium.com/@bigdataschool,,0.0,0.0,6.0,Big Data,2021-01-01,большая разница чем структурированная потокова...
4,https://medium.com/@arnowa-44509/iot-a-new-way...,IoT: A New Way of Life,arnowa-44509,https://medium.com/@arnowa-44509,"IoT is more than just the Internet of things, ...",0.0,0.0,3.0,Big Data,2021-01-01,iot a new way of life


### Removing stopwords

In [6]:
# Import NLTK library
import nltk

In [7]:
# Import stopwords
from nltk.corpus import stopwords

# View stopwords in english
", ".join(stopwords.words('english'))

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [8]:
# Import word_tokenize from nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')
# Tokenize cleaned messages
df['title_tokenized'] = df.apply(lambda x: nltk.word_tokenize(x['title_clean']), axis=1)
df.head()

[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


Unnamed: 0,url,title,author,author_page,subtitle,claps,responses,reading_time,tag,date,title_clean,title_tokenized
0,https://medium.com/@dharmeshpanchmatia/data-an...,Data Analytics and AI/ML platform for eCommerce,dharmeshpanchmatia,https://medium.com/@dharmeshpanchmatia,Improve user pr,30.0,0.0,5.0,Big Data,2021-01-01,data analytics and aiml platform for ecommerce,"[data, analytics, and, aiml, platform, for, ec..."
1,https://medium.com/predict/the-journey-from-an...,The journey from an Apple Silicon M1 powered l...,dimitrisv,https://medium.com/@dimitrisv,,42.0,0.0,5.0,Big Data,2021-01-01,the journey from an apple silicon m1 powered l...,"[the, journey, from, an, apple, silicon, m1, p..."
2,https://medium.com/@rajsaraogi/demystifying-kp...,Demystifying KPIs in Analytics,rajsaraogi,https://medium.com/@rajsaraogi,,19.0,0.0,3.0,Big Data,2021-01-01,demystifying kpis in analytics,"[demystifying, kpis, in, analytics]"
3,https://medium.com/@bigdataschool/%D0%B1%D0%BE...,Большая разница: чем структурированная потоков...,bigdataschool,https://medium.com/@bigdataschool,,0.0,0.0,6.0,Big Data,2021-01-01,большая разница чем структурированная потокова...,"[большая, разница, чем, структурированная, пот..."
4,https://medium.com/@arnowa-44509/iot-a-new-way...,IoT: A New Way of Life,arnowa-44509,https://medium.com/@arnowa-44509,"IoT is more than just the Internet of things, ...",0.0,0.0,3.0,Big Data,2021-01-01,iot a new way of life,"[iot, a, new, way, of, life]"


In [9]:
# Save the preprocessed DataFrame
df.to_csv('processed_clean_df.csv', header = True)