# Load data and python libraries

In [1]:
# make plot s appear after the code cell
%matplotlib inline 

# data processing libraries
import pandas as pd

# display wider columns in pandas data frames where necessary
pd.set_option('max_colwidth',150)

# data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

#detect language
from langdetect import detect

# supporting libraries
import re
import pickle

In [2]:
# file location of the data
input_folder = './data/'
output_folder = './transition_files/'

file_name = 'all-the-news-2-1.csv'

In [3]:
# load data
df_data = pd.read_csv(input_folder + file_name, #file location
                      encoding = "ISO-8859-1", #deal with texts in different formats
                     )

# display first row of the data frame
print(df_data.shape)
df_data.head(1).T

  interactivity=interactivity, compiler=compiler, result=result)


(2688879, 12)


Unnamed: 0,0
Unnamed: 0,0
Unnamed: 0.1,0
date,2016-12-09 18:31:00
year,2016
month,12
day,9
author,Lee Drutman
title,We should take concerns about the health of liberal democracy seriously
article,"This post is part of Polyarchy, an independent blog produced by the political reform program at New America, a Washington think tank devoted to de..."
url,https://www.vox.com/polyarchy/2016/12/9/13898340/democracy-warning-signs


# Data Exploration and Cleaning

In [4]:
def check_data (df):
    """
    check data types of values in the pandas data frame and number of missing values
    
    input:
        df - as pandas data frame to analyze
    
    output:
        pandas data frame with column name, 
                                column data type, 
                                the data type of actual value in the column
                                number of missing values
                                value example
    """
    df_data_types = []
    actual_data_types = []
    num_missing = []
    values = []
    columns = list(df_data.columns)
    
    for column in df_data.columns:
        #selecting only non missing values in the column
        df_tmp = df[df[column].isnull() == False]
        
        #count number of missing values
        num_missing.append(len(df) - len(df_tmp))
        
        #getting column data type
        dtype = str(df_tmp[column].dtypes)
        df_data_types.append(dtype)
                
        #getting data type of an actual value
        actual_value = df_tmp[column].iloc[0]
        m = re.search("'.+'", str(type(actual_value)))
        if m:
            dtype = m.group(0)
        else:
            dtype =  ''   
        actual_data_types.append(dtype)
        values.append(actual_value)
        
    #create data frame with data types comparison
    df_result = pd.DataFrame({
                              'data type': df_data_types,
                              'actual data type': actual_data_types,
                              'number of missing values': num_missing,
                              '% of missing values': [round(n / len(df) * 100,2) for n in num_missing],
                              'value example': values
                             }, index=columns)
    return df_result

In [5]:
# checking data quality
print('datatype = "object" means the column has string and/or missing values in it.')
check_data(df_data)

datatype = "object" means the column has string and/or missing values in it.


Unnamed: 0,data type,actual data type,number of missing values,% of missing values,value example
Unnamed: 0,int64,'numpy.int64',0,0.0,0
Unnamed: 0.1,object,'int',0,0.0,0
date,object,'str',1,0.0,2016-12-09 18:31:00
year,object,'int',0,0.0,2016
month,float64,'numpy.float64',1,0.0,12
day,object,'int',0,0.0,9
author,object,'str',1021102,37.98,Lee Drutman
title,object,'str',38,0.0,We should take concerns about the health of liberal democracy seriously
article,object,'str',104714,3.89,"This post is part of Polyarchy, an independent blog produced by the political reform program at New America, a Washington think tank devoted to de..."
url,object,'str',12578,0.47,https://www.vox.com/polyarchy/2016/12/9/13898340/democracy-warning-signs


**NOTE:<br>We have >60% of articles that are assigned to some section in a paper. So we can use some for model validation.**

***
## Section data

In [6]:
print("NOTE: section data is noisy!")
print("Number of sections per publication:")
s = pd.DataFrame(df_data.groupby('publication')['section'].nunique())
s.describe(percentiles=[0.6,0.7,0.8,0.9,0.95]).T

NOTE: section data is noisy!
Number of sections per publication:


Unnamed: 0,count,mean,std,min,50%,60%,70%,80%,90%,95%,max
section,26.0,299.0,907.633891,0.0,0.5,28.0,57.5,158.0,652.0,1163.5,4496.0


In [7]:
print("Number of articles per section:")
s = pd.DataFrame(df_data.groupby('section')['article'].count())
s.describe(percentiles=[0.6,0.7,0.8,0.9,0.95]).T

Number of articles per section:


Unnamed: 0,count,mean,std,min,50%,60%,70%,80%,90%,95%,max
article,7509.0,233.507924,2852.148236,0.0,1.0,1.0,1.0,4.0,22.0,147.6,108650.0


In [8]:
#calculate number_of_sections per publication
df_data['number_of_sections'] = df_data.groupby('publication')['section'].transform("nunique")
df_data[['publication', 'section', 'number_of_sections']].head()

Unnamed: 0,publication,section,number_of_sections
0,Vox,,0.0
1,Business Insider,,0.0
2,Reuters,Davos,226.0
3,Reuters,World News,226.0
4,TMZ,,0.0


In [9]:
lower_thr = 10
upper_thr = 50
print("Take only publications with reasonable number of sections (%2d-%2d)"%(lower_thr, upper_thr))

df_test = df_data[(df_data['number_of_sections'] >= lower_thr) &
                  (df_data['number_of_sections'] <= upper_thr)
                 ]
print("Number of articles:", len(df_test), " out of", len(df_data))
print("\nPublications:", set(df_test['publication']))
print("\nSections:\n", set(df_test['section']))
s = pd.DataFrame(df_test.groupby('publication')['section'].nunique())
s.describe(percentiles=[0.6,0.7,0.8,0.9,0.95]).T

Take only publications with reasonable number of sections (10-50)
Number of articles: 182958  out of 2688879

Publications: {'People', 'Economist', 'Wired'}

Sections:
 {nan, 'prospero', 'photo', 'business', 'trends', 'financial-indicators', 'movies', 'opinion', 'pets', 'home', 'bartleby', 'privacy', 'uncategorized', 'environment', 'gulliver', 'beauty', 'country', 'economic-and-financial-indicators', 'the-americas', 'the-world-in', 'babies', 'essay', 'europe', 'fashion', 'obituary', 'gear', 'graphic-detail', 'archive', 'premium', 'bodies', 'real-estate', 'special-report', 'parents', 'test', 'magazine', 'democracy-in-america', 'lifestyle', 'economic-indicators', 'international', 'china', 'transportation', 'ideas', 'middle-east-and-africa', 'entertainment', 'speakers-corner', 'free-exchange', 'design', 'christmas-specials', 'economics-brief', 'science', 'tv', 'charlemagnes-notebook', 'style', 'theater', 'united-states', 'deals', 'gadget-lab-podcast', 'books', 'finance-and-economics', 'op

Unnamed: 0,count,mean,std,min,50%,60%,70%,80%,90%,95%,max
section,3.0,37.0,10.148892,28.0,35.0,37.6,40.2,42.8,45.4,46.7,48.0


**Sections to consider:**
- music
- culture 
- schools-brief
- business
- awards
- travel | outdoor
- sports
- real-estate
- politics
- tech
- economic-indicators | finance-and-economics | economic-and-financial-indicators |  
- health


etc. (needs to be discussed)

In [10]:
print("Number of articles per section IN SELECTED PUBLICATIONS:")
s = pd.DataFrame(df_test.groupby('section')['article'].count())
s.describe(percentiles=[0.6,0.7,0.8,0.9,0.95]).T

Number of articles per section IN SELECTED PUBLICATIONS:


Unnamed: 0,count,mean,std,min,50%,60%,70%,80%,90%,95%,max
article,108.0,1657.824074,3476.795855,0.0,268.0,540.2,1060.8,1990.0,4742.0,9108.05,24616.0


## Text quality

In [11]:
# calculate text length of each article in symbols
df_data["text_length"] = df_data['article'].fillna("").apply(len)

#look at descriptive statistic
pd.DataFrame(df_data["text_length"].describe(percentiles=[0.01,0.05,0.25,0.5,0.75,0.95,0.99])).T

Unnamed: 0,count,mean,std,min,1%,5%,25%,50%,75%,95%,99%,max
text_length,2688879.0,3031.924927,3410.513377,0.0,0.0,161.0,1000.0,2264.0,3983.0,8280.0,14308.0,224981.0


In [12]:
#Example: text with less than 150 symbols
df_data[df_data["text_length"] < 150]['article'].iloc[0]

"  Check out more of Paige Mehrer's work on her Tumblr and website."

In [13]:
# delete reviews with length less than 10th percentile and more than 95th percentile of the sample
#since they are suspiciously short or long
#calculate length percentiles
pct10 = df_data["text_length"].quantile(0.10)
pct95 = df_data["text_length"].quantile(0.95)

print('minimum length: ',df_data["text_length"].min())
print('maximum  length: ',df_data["text_length"].max())
print('\n10th percentile: ', pct10, '\n95th percentile: ', pct95)

#delete suspicious values
print('\n\nData size before deletion: ', len(df_data))
df_data = df_data[(df_data["text_length"] >= pct10) & (df_data["text_length"] <= pct95)]
print('Data size after deletion:  ', len(df_data))

minimum length:  0
maximum  length:  224981

10th percentile:  328.0 
95th percentile:  8280.0


Data size before deletion:  2688879
Data size after deletion:   2285614


In [14]:
# look at descriptive statistics
print("Total number of observations: ", len(df_data))
pd.DataFrame(df_data["text_length"].describe(percentiles=[0.01,0.05,0.25,0.5,0.75,0.95,0.99])).T

Total number of observations:  2285614


Unnamed: 0,count,mean,std,min,1%,5%,25%,50%,75%,95%,99%,max
text_length,2285614.0,2799.668501,1877.527771,328.0,362.0,505.0,1343.0,2388.0,3841.0,6684.0,7875.0,8280.0


**NOTE:** Articles with missing text were deleted from the data

In [15]:
#test languages of every 500th article
def define_language(df, column):
    # detect languages for each text in the column
    lang_list = []
    for i in range(len(df)):
        text = df[column].iloc[i]
        try:
            language = detect(text)
        except:
            language = "error"
        lang_list.append(language)

    return lang_list

#############################################################
df_test = df_data.iloc[::500]
df_test['article_language'] = define_language(df_test, "article") 
df_test['article_language'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


en    4569
es       2
fr       1
Name: article_language, dtype: int64

## Date

In [16]:
# get article posting date in python format where possible 
#NOTE: if string can not be converted it is replaced with missing value NaT
df_data['py_date'] = pd.to_datetime(df_data['date'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
df_data[['date', 'py_date']].head()

Unnamed: 0,date,py_date
0,2016-12-09 18:31:00,2016-12-09 18:31:00
1,2016-10-07 21:26:46,2016-10-07 21:26:46
2,2018-01-26 00:00:00,2018-01-26 00:00:00
3,2019-06-27 00:00:00,2019-06-27 00:00:00
4,2016-01-27 00:00:00,2016-01-27 00:00:00


In [17]:
df_data['py_date'].describe()

count                 2285614
unique                 508348
top       2020-03-12 00:00:00
freq                     3019
first     2016-01-01 00:00:00
last      2020-04-02 00:00:00
Name: py_date, dtype: object

In [18]:
print("Number of articles per year:")
df_data['py_year'] = df_data['py_date'].dt.year
df_data['py_year'].value_counts()

Number of articles per year:


2019    564104
2017    546798
2016    522057
2018    486419
2020    166236
Name: py_year, dtype: int64

In [19]:
print("Number of articles per month:")
df_data['py_month'] = df_data['py_date'].dt.month
df_data['py_month'].value_counts().sort_index()

Number of articles per month:


1     219070
2     217288
3     247325
4     172133
5     184217
6     185828
7     182018
8     176892
9     176896
10    192772
11    179272
12    151903
Name: py_month, dtype: int64

In [20]:
print("Number of months per year:")
print(df_data.groupby("py_year")['py_month'].nunique().sort_index())

print("\nCovered months in 2020:", set(df_data[df_data['py_year'] == 2020]['py_month']))

Number of months per year:
py_year
2016    12
2017    12
2018    12
2019    12
2020     4
Name: py_month, dtype: int64

Covered months in 2020: {1, 2, 3, 4}


In [21]:
set(df_data[df_data['py_year'] == 2020]['py_month'])

{1, 2, 3, 4}

***
# Conclusions:
- We have enough data with predefined labels to test Topic Modeling algorithm.
- Only first 4 months are covered in 2020 (if it has any importance to capture Covid-19 news)
- There are non-English articles! (need to clean that)
