In [16]:
import pandas as pd
import numpy as np
from datetime import date
import calendar
import plotly.io as pio
import plotly.express as px
import chart_studio.plotly as py

In [2]:
df = pd.read_csv("medium_NLP.csv")

In [3]:
df.head(2)

Unnamed: 0,Title,Subtitle,Image,Author,Publication,Year,Month,Day,Tag,Reading_Time,Claps,Comment,url,Author_url
0,How to build a State-of-the-Art Conversationa...,,1,Thomas Wolf,HuggingFace,2019,5,9,NLP,12,3.7K,0,https://medium.com/huggingface/how-to-build-a-...,https://medium.com/@Thomwolf?source=tag_archiv...
1,"Smaller, faster, cheaper, lighter: Introducin...",,1,Victor Sanh,HuggingFace,2019,8,28,NLP,10,3.1K,0,https://medium.com/huggingface/distilbert-8cf3...,https://medium.com/@victorsanh?source=tag_arch...


In [4]:
df.tail(2)

Unnamed: 0,Title,Subtitle,Image,Author,Publication,Year,Month,Day,Tag,Reading_Time,Claps,Comment,url,Author_url
3775,,,0,Nick Saraev,,2020,5,1,NLP,0,0,1,https://medium.com/voice-tech-global/conversat...,https://medium.com/@nick_wells?source=tag_arch...
3776,,,0,Nick Saraev,,2020,5,1,NLP,0,0,1,https://medium.com/@CobusGreyling/managing-use...,https://medium.com/@nick_wells?source=tag_arch...


In [5]:
df_clean = df.iloc[:,0:12]

In [7]:
df_clean.head(2)

Unnamed: 0,Title,Subtitle,Image,Author,Publication,Year,Month,Day,Tag,Reading_Time,Claps,Comment
0,How to build a State-of-the-Art Conversationa...,,1,Thomas Wolf,HuggingFace,2019,5,9,NLP,12,3.7K,0
1,"Smaller, faster, cheaper, lighter: Introducin...",,1,Victor Sanh,HuggingFace,2019,8,28,NLP,10,3.1K,0


In [6]:
df_clean['Date'] = pd.to_datetime(df[["Year","Month","Day"]])

In [8]:
my_date = date.today()
calendar.day_name[my_date.weekday()] 

'Monday'

In [11]:
pio.renderers.default = 'iframe'

In [16]:
np.unique(df_clean['Date'].dt.day_name())

array(['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday',
       'Wednesday'], dtype=object)

In [17]:

for template in ['ggplot2', 'seaborn', 'simple_white', 'plotly',
         'plotly_white', 'plotly_dark', 'presentation', 'xgridoff',
         'ygridoff', 'gridon', 'none']:
    fig = px.histogram(x=np.unique(df_clean['Date'].dt.day_name()) ,y = df_clean['Date'].dt.day_name().value_counts(),
                 template = template,
                #text = df_clean['Date'].dt.day_name().value_counts(),
                color =np.unique(df_clean['Date'].dt.day_name()),
                title= 'Day wise # of publications'
                )
    fig.show()

In [18]:
df_clean.dtypes

Title                   object
Subtitle                object
Image                    int64
Author                  object
Publication             object
Year                     int64
Month                    int64
Day                      int64
Tag                     object
Reading_Time             int64
Claps                   object
Comment                  int64
Date            datetime64[ns]
dtype: object

In [14]:
Score = pd.DataFrame()
Score['day'] = df_clean['Date'].dt.day_name()
Score['claps'] = df_clean['Claps']

In [18]:
fig = px.histogram(Score, x='day' ,y = 'claps',
                 template = template,histfunc='avg',
                #text = df_clean['Claps'],
                color = 'day'
                #title= 'claps'
                  
                ,labels={'x':'Day', 'y':'Mean of Claps'})
fig.update_layout(xaxis_type='category',xaxis={'categoryorder':'total descending'},title = 'Average Claps Based on Release Day')
fig.update_layout(showlegend=False)

py.iplot(fig, filename='Avg_Claps_Day')


In [27]:
units= {"K":1000,"M":1000000}
result=[]
for n in df_clean['Claps']:
        try:
            result.append( float(n) )  #try to comber it to a number
        except ValueError:
            unit=n[-1]                 #get the letter
            n = float( n[:-1] )        #convert all but the letter
            result.append( n * units[unit] )


In [28]:
df_clean['Claps'] = result

In [29]:
fig = px.histogram(df_clean, x='Claps',
                  #color="day", 
                   #facet_col="day",
                   nbins=10,
                  log_y=True)

#fig.update_layout(xaxis_type='category',xaxis={'categoryorder':'total descending'})

fig.show()

In [30]:
over_10_claps = ['more' if claps > 10 else 'less' for claps in df_clean['Claps']]


In [31]:
criteria = [df_clean['Claps'].between(0, 10), df_clean['Claps'].between(11, 100), df_clean['Claps'].between(101, 4000)]
values = ['0-5', '6-10', '>10']

df_clean['Clap_Range'] = np.select(criteria, values, 0)

In [33]:
fig = px.histogram(df_clean, x='Clap_Range'
                   #facet_col="day",
                #nbins=2
            )

#fig.update_layout(xaxis_type='category',xaxis={'categoryorder':'total descending'})

fig.update_layout(title = 'Claps Range')

py.iplot(fig, filename='Claps_Range')


In [31]:
pub = ['no' if Publication != Publication else 'Yes' for Publication in df_clean['Publication']]

In [32]:
print(df_clean['Publication'][:5])
pub[:5]

0         HuggingFace
1         HuggingFace
2        Turing Talks
3    Rakuten RapidAPI
4                 NaN
Name: Publication, dtype: object


['Yes', 'Yes', 'Yes', 'Yes', 'no']

In [212]:
Score.head(2)

Unnamed: 0,day,claps
0,Thursday,3700.0
1,Wednesday,3100.0


In [205]:
import chart_studio
chart_studio.tools.set_credentials_file(username='tej_bat', api_key='xxxxxxxxxxx')

In [265]:
df_clean['month_']=df_clean['till_month'].astype('category')

In [38]:
df_clean.head(2)

Unnamed: 0,Title,Subtitle,Image,Author,Publication,Year,Month,Day,Tag,Reading_Time,Claps,Comment,Date,Clap_Range
0,How to build a State-of-the-Art Conversationa...,,1,Thomas Wolf,HuggingFace,2019,5,9,NLP,12,3700.0,0,2019-05-09,>10
1,"Smaller, faster, cheaper, lighter: Introducin...",,1,Victor Sanh,HuggingFace,2019,8,28,NLP,10,3100.0,0,2019-08-28,>10


In [249]:
df_clean['till_month'] = df_clean['Date'].dt.to_period('M')


In [42]:
print(max(df_clean['Date']))
print(min(df_clean['Date']))
print(max(df_clean['Date']) - min(df_clean['Date']))

2020-05-01 00:00:00
2019-05-01 00:00:00
366 days 00:00:00


In [46]:
date_from_df = (df_clean['Date']).sort_values()
date_from_df[:5]

2477   2019-05-01
2727   2019-05-01
3235   2019-05-01
1408   2019-05-01
473    2019-05-01
Name: Date, dtype: datetime64[ns]

In [157]:
day_age = []
day_age = (date_from_df - min(date_from_df)).astype('timedelta64[D]')

In [158]:
#day_age = day_age.astype(int)
day_age = (day_age/30).astype('int')

In [159]:
df_clean_sorted = df_clean.sort_values('Date')

In [160]:
df_clean_sorted['Day_Age'] = day_age

In [161]:
366/7

52.285714285714285

In [162]:
np.unique(day_age)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [163]:
day_age

2477     0
2727     0
3235     0
1408     0
473      0
        ..
1522    12
1317    12
722     12
2329    12
3776    12
Name: Date, Length: 3777, dtype: int64

In [164]:
df_clean_sorted.head(2)

Unnamed: 0,Title,Subtitle,Image,Author,Publication,Year,Month,Day,Tag,Reading_Time,Claps,Comment,Date,Clap_Range,Day_Age
2477,,,1,,,2019,5,1,NLP,6,2.0,0,2019-05-01,0-5,0
2727,I Googled for you: Natural Language Generation,Here are some resources to get you (me) starte...,1,Markos Giannopoulos,Human on Tech,2019,5,1,NLP,3,1.0,0,2019-05-01,0-5,0


In [165]:
df_clean_sorted_I = df_clean_sorted[['Reading_Time','Claps','Day_Age','Author',]]

In [175]:
fig = px.scatter(df_clean_sorted_I.iloc[2001:2500,], x="Reading_Time", y="Claps", animation_frame="Day_Age", animation_group="Author",
                 size="Claps", color="Author", hover_name="Author",size_max=100)
fig.update_layout(title = 'Day Wise Trend')

py.iplot(fig, filename='Day_Wise_Trend 2')

In [176]:
df_features = pd.read_csv('df_features.csv')

In [177]:
df_features.dtypes

Title                  object
Subtitle               object
Image                   int64
Author                 object
Publication            object
Year                    int64
Month                   int64
Day                     int64
Tag                    object
Reading_Time            int64
Claps                 float64
Comment                 int64
no_of_blockquotes       int64
no_of_bolded_text       int64
no_of_italics_text      int64
no_of_figures_text      int64
no_of_code_chunks       int64
compound_senti        float64
neg_senti             float64
neu_senti             float64
pos_senti             float64
dtype: object

In [182]:
df_features['Date'] = pd.to_datetime(df_features[["Year","Month","Day"]])

In [183]:
df_features = df_features.sort_values('Date')

In [186]:
df_features['week'] = df_features['Date'].dt.week

In [189]:
df_features.tail(100)

Unnamed: 0,Title,Subtitle,Image,Author,Publication,Year,Month,Day,Tag,Reading_Time,...,no_of_bolded_text,no_of_italics_text,no_of_figures_text,no_of_code_chunks,compound_senti,neg_senti,neu_senti,pos_senti,Date,week
44,Knowledge Graphs @ ICLR 2020,"Hello, I hope you are all doing well during t...",1,Michael Galkin,no publication,2020,4,27,NLP,15,...,39,65,15,0,0.9998,0.031,0.871,0.098,2020-04-27,18
884,Natural Language Processing and Its impact on ...,no subtitle,1,Jeevan Jamakayala,Analytics Vidhya,2020,4,27,NLP,3,...,2,0,2,0,0.9987,0.014,0.828,0.158,2020-04-27,18
377,SPIKE-CORD,SPIKE-CORD is a powerful new tool for searchin...,1,Yoav Goldberg,AI2 Blog,2020,4,27,NLP,7,...,16,11,3,0,0.9978,0.026,0.877,0.097,2020-04-27,18
1183,Alleviate Cold Start Problem in Song Recommend...,no subtitle,1,Zhenghao Tan,no publication,2020,4,27,NLP,6,...,0,0,8,0,0.9580,0.036,0.904,0.060,2020-04-27,18
846,AI April NLP Math Teacher Challenge,TLDR; Build a model that can perform automatic...,1,AaronAri) Bornstein,Microsoft Azure,2020,4,27,NLP,2,...,1,0,2,0,0.9571,0.042,0.797,0.161,2020-04-27,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
588,Deep Learning for Natural Language Processing ...,Reading Comprehension using Expo,1,"Yuefeng Zhang, PhD",Towards Data Science,2020,5,1,NLP,8,...,6,32,5,12,0.9874,0.000,0.963,0.037,2020-05-01,18
551,Top Free Machine Learning Courses With Certifi...,no subtitle,1,Python Learning,no publication,2020,5,1,NLP,1,...,0,0,1,0,0.8779,0.000,0.736,0.264,2020-05-01,18
459,Transformers: A curious case of attention,In this post we will go through the intricacie...,1,Abhishek Perambai,Analytics Vidhya,2020,5,1,NLP,7,...,13,5,8,0,0.9973,0.033,0.876,0.091,2020-05-01,18
871,Five Cool Python Libraries for Data Science,Check out these 5 cool Python libraries that t...,1,Arpit Bhushan Sharma,Machine Learning with ABS,2020,5,1,NLP,5,...,45,1,8,15,0.9971,0.031,0.807,0.162,2020-05-01,18


In [204]:
fig = px.scatter(df_features.head(1487), x="Reading_Time", y="Claps", animation_frame="week", animation_group="Author",
                 size="Claps", color="Publication", hover_name="Author",size_max=100)
fig.update_layout(title = 'Week Wise Trend')

py.iplot(fig, filename='Week_Wise_Trend')

In [264]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer= SentimentIntensityAnalyzer()

df_features_1['Title_compound_senti'] = [analyzer.polarity_scores(v)['compound'] for v in df_features_1['Title']]
df_features_1['Subtitle_compund_senti'] = [analyzer.polarity_scores(v)['compound'] for v in df_features_1['Subtitle']]


In [246]:
df_features_1['Publication'] = df_features_1['Publication'].apply(lambda x: 0 if x == 'no publication' else 1)

In [203]:
len(df_features)

1487

In [None]:
df_features_1 = df_features.drop(['Author','Year','Month','Day','Date','week'],inplace = False , axis = 1)

In [None]:
df_features_1.head(2)