In [None]:
# install the necessary libraries and modules

# news_extract to extract news articles from Factiva to data frame
!pip install news_extract

# wordcloud to create wordcloud from text articles and headlines
!pip install wordcloud

# pycountry to create a list of countries
!pip install pycountry 

# vaderSentiment to sentiment analysis with Vader library
!pip install vaderSentiment

In [None]:
import os
import pandas as pd
import numpy as np
import news_extract as ne
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline
import ipywidgets as widgets
import pycountry
import seaborn as sns
from afinn import Afinn
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

## WordCloud

In [None]:
# module news_extract extracts articles and associated metadata from pre-exported output files
# all pre-exported files are stored in the folder 'Factiva_new'
factiva_dir = 'C:/Users/cvirova-t/Factiva_new'
# sorted() makes the row order of the combined frame reproducible between runs
files = sorted(os.listdir(factiva_dir))

# turn every file in the folder into its own DataFrame and collect them in a list;
# concatenating once at the end avoids re-copying the growing frame on every iteration
frames = []
for f in files:
    fc_data = ne.factiva_extract(factiva_dir + "/" + f)
    frames.append(pd.DataFrame(fc_data))

# pd.concat raises on an empty list, so fall back to an empty frame for an empty folder
full_df = pd.concat(frames, sort = False) if frames else pd.DataFrame()

In [None]:
# select the columns of the general data frame to work with:
# 'HD' is headings, 'TXT' is texts of articles, 'PD' is publication date,
# 'SN' is publishers, 'NS' is topics, 'RE' is countries
# .copy() makes work_df an independent frame, so the sentiment column added to it
# later does not trigger pandas' SettingWithCopyWarning
work_df = full_df[['HD', 'PD', 'SN', 'TXT', 'NS', 'RE']].copy()

In [None]:
# two dropdowns drive the word cloud:
# the publication year (2019 or 2020) and the text source (article texts or headings)
choose_year = widgets.Dropdown(
    options = ['2019', '2020'],
    value = '2019',
    description = 'Year: ',
    disabled = False,
)
choose_text = widgets.Dropdown(
    options = [('Texts', 'TXT'), ('Headings', 'HD')],
    value = 'TXT',
    description = 'Type: ',
    disabled = False,
)
# render both dropdowns, year first
for dropdown in (choose_year, choose_text):
    display(dropdown)

In [None]:
# build one lowercase string with all texts for the wordcloud module,
# using the year and the text type selected in the widgets
# rows with a missing publication date or a missing text are dropped first:
# str.contains yields NaN for them (unusable as a boolean mask) and " ".join fails on NaN
# (previously the cleaned copy was created but the filter still ran on work_df)
test_df = work_df.dropna(subset = ['PD', choose_text.value])
get_year = test_df[test_df['PD'].str.contains(choose_year.value)]
all_texts = " ".join(get_year[choose_text.value].values)
all_texts = all_texts.lower()

In [None]:
# load the positive-word lexicon into a plain Python list
# rows before position 34 are skipped (presumably the file's preamble — TODO confirm against the file)
df_positive = pd.read_table('C:/Users/cvirova-t/positive-words.txt')
clear_positive = df_positive.iloc[34:, 0]
positive_words = clear_positive.tolist()

In [None]:
# load the negative-word lexicon into a plain Python list
# rows before position 35 are skipped (presumably the file's preamble — TODO confirm against the file)
df_negative = pd.read_table('C:/Users/cvirova-t/negative-words.txt')
clear_negative = df_negative.iloc[35:, 0]
negative_words = clear_negative.tolist()

In [None]:
# extra words to exclude from the word cloud, added to wordcloud's shared STOPWORDS set
# NOTE(review): the multi-word and capitalised entries ('per cent', 'gen Y', 'generation Y')
# may never match single lowercase tokens — confirm against wordcloud's tokenisation
extra_stopwords = {
    'millennial', 'millennials', 'generation',
    'gen', 'us', 'per cent', 'per', 'cent',
    'age', 'year', 'make', 'percent', 'according',
    'one', 'said', 'say', 'among', 'three',
    'gen Y', 'generation Y', 'will', 'two',
}
STOPWORDS.update(extra_stopwords)

In [None]:
def my_color_func(word, font_size, position, orientation, random_state = None,
                    **kwargs):
    """
    Colour callback for WordCloud.recolor based on the two word lists.

    Parameter: word is the word in wordCloud; the remaining parameters are
               accepted for the callback signature and are not used.
    Returns 'forestgreen' if word is in positive_words, 'crimson' if it is in
    negative_words (and not in positive_words), otherwise 'silver'.
    """
    # checked in order, so a word present in both lists is coloured as positive
    palette = (('forestgreen', positive_words), ('crimson', negative_words))
    for colour, lexicon in palette:
        if word in lexicon:
            return colour
    return 'silver'

In [None]:
def my_color_func2(word, font_size, position, orientation, random_state = None,
                    **kwargs):
    """
    Colour callback for WordCloud.recolor based on the Afinn score of the word.

    Relies on the module-level `afinn` scorer, which must be created before
    this function is called.

    Parameter: word is the word in wordCloud; the remaining parameters are
               accepted for the callback signature and are not used.
    Returns 'forestgreen' for a score above 0, 'crimson' for a score below 0,
    otherwise 'silver'.
    """
    # score the word once instead of once per comparison
    score = afinn.score(word)
    if score > 0:
        col = 'forestgreen'
    elif score < 0:
        col = 'crimson'
    else:
        col = 'silver'
    return col

In [None]:
# build a word cloud limited to the 50 most frequent words of the selected texts
cloud_options = dict(background_color = 'white',
                     max_words = 50,
                     stopwords = STOPWORDS,
                     width = 2000,
                     height = 1200,
                     font_step = 1)
wordcloud = WordCloud(**cloud_options).generate(all_texts)

# draw the cloud, recoloured by my_color_func, without axes
plt.figure(figsize = [15, 15])
default_colors = wordcloud.to_array()
recolored = wordcloud.recolor(color_func = my_color_func, random_state = 3)
plt.imshow(recolored, interpolation = 'bilinear')
plt.axis('off')
plt.show()

In [None]:
# save the picture to a png file named after the selected text type and year
# NOTE(review): the file name prefix is spelled 'wordcoud' — confirm whether that is intended
wordcloud.to_file(f'wordcoud_{choose_text.value}_{choose_year.value}.png')

## Quantitative analysis

In [None]:
# create a sorted list of unique topics

# unique non-missing values of the dataframe column 'NS'
NS = work_df[['NS']].dropna()
NS_list = list(NS['NS'].unique())

# every value holds several 'key:topic' entries separated by '|'
NS_split = []
for i in NS_list: 
    NS_split.extend(i.split('|'))

# map key -> topic; split on the first ':' only, so a topic that itself contains
# a colon stays intact, and skip fragments without any ':' instead of raising
NS_dict = {}
for entry in NS_split:
    key, sep, topic = entry.partition(':')
    if sep:
        NS_dict[key] = topic

# clear values (newlines -> spaces, surrounding whitespace removed), de-duplicate
# through a set and sort; an empty topic is dropped because as a substring it would
# match every article when the topics are counted
topics = sorted({v.replace('\n', ' ').strip() for v in NS_dict.values()} - {''})

In [None]:
# create an array of unique publishers
# missing values are dropped first: a NaN entry cannot be used as a substring
# when the publishers are counted per article
publishers = work_df['SN'].dropna().unique()

In [None]:
# collect the name of every country known to pycountry
countries = [country.name for country in list(pycountry.countries)]

In [None]:
# create a list of company names
# the csv is read as plain text; newlines are turned into commas so that
# every cell of every row becomes one entry
with open('C:/Users/cvirova-t/comps.csv', 'r') as fileObj:
    contents = fileObj.read().replace('\n', ',')
comps = contents.split(',')
# strip surrounding whitespace and skip empty cells (e.g. the one produced by a
# trailing newline), since an empty name would match any text as a substring
comps_clean = [comp.strip() for comp in comps if comp.strip()]
# 'Netflix' is added by hand
comps_clean.append('Netflix')

In [None]:
def count_uniques(dataframe, column_to_find, column_year, 
                  list_of_uniques, year, total_number):
    """
    Count, for one year, in how many rows each unique value occurs.

    Parameters: dataframe is the dataframe to work with,
                column_to_find is the name of the column where to find unique values,
                column_year is the name of the column with publication date, 
                list_of_uniques is the list with unique values, 
                year is a string that must occur in the publication date,
                total_number is the number, min value (exclusive).
    Returns a new dataframe with the unique values in column 0 and their counts
    in column 1, limited to counts greater than total_number and sorted by count
    in descending order. The index keeps the positions within list_of_uniques.
    """
    test_array = np.zeros(len(list_of_uniques), dtype = np.int64)
    # na = False: rows without a publication date simply do not match the year
    get_year = dataframe[dataframe[column_year].str.contains(year, na = False)]
    
    for el in get_year[column_to_find]:
        if pd.isnull(el):
            continue
        # substring match: a value is counted once per row that contains it
        for index, unique in enumerate(list_of_uniques):
            if unique in el:
                test_array[index] += 1
                
    # explicit columns 0 and 1 keep the frame well-formed for an empty list_of_uniques
    test_df = pd.DataFrame({0: list(list_of_uniques), 1: test_array})
    test_df = test_df[test_df[1] > total_number] 
    test_df = test_df.sort_values(by = [1], ascending = False)
    
    return test_df

In [None]:
def Barplot(dataframe, column_for_x, column_for_y, 
           name_x_axis, name_y_axis):
    """
    Draw a green bar plot of one dataframe column against another.

    Parameters: dataframe is the dataframe to work with,
                column_for_x is the name of the column to be displayed on the x axis, 
                column_for_y is the name of the column to be displayed on the y axis,
                name_x_axis is the label of the x axis,
                name_y_axis is the label of the y axis.
    Returns the object returned by sns.barplot (the axes the bars were drawn on).
    """
    sns.set_style("whitegrid")
    x = dataframe[column_for_x]
    y = dataframe[column_for_y]
    
    plt.figure(figsize = (15, 5))
    plt.xticks(rotation = 90)
    # x and y are passed by keyword: seaborn 0.12 and later no longer accept
    # them as positional arguments
    graph = sns.barplot(x = x, y = y, color = 'forestgreen')
    
    plt.ylabel(name_y_axis)
    plt.xlabel(name_x_axis)
    
    return graph

In [None]:
# two dropdowns drive the bar chart:
# the publication year (2019 or 2020) and the category (publishers, topics or countries)
choose_year2 = widgets.Dropdown(
    options = ['2019', '2020'],
    value = '2019',
    description = 'Year: ',
    disabled = False,
)
choose_category = widgets.Dropdown(
    options = ['Publishers', 'Topics', 'Countries'],
    value = 'Publishers',
    description = 'Category: ',
    disabled = False,
)
# render both dropdowns, year first
for selector in (choose_year2, choose_category):
    display(selector)

In [None]:
# count and plot the selected category for the selected year
# the year only changes the filter, so a single code path serves both years
selected_year = choose_year2.value
selected_category = choose_category.value

# per category: column to search, list of unique values and the minimum number
# of articles (exclusive) a value needs in order to be plotted
if selected_category == 'Publishers':
    column, uniques, min_articles = 'SN', publishers, 30
elif selected_category == 'Topics':
    column, uniques, min_articles = 'NS', topics, 70
elif selected_category == 'Countries':
    column, uniques, min_articles = 'RE', countries, 50
else:
    raise ValueError(f'Unknown category: {selected_category!r}')

category_values = count_uniques(work_df, column, 'PD', uniques, selected_year, min_articles)

if selected_category == 'Publishers':
    # row 59 (position 59 in `publishers`) is excluded from the publishers plot;
    # errors = 'ignore' avoids a KeyError when that row was already removed by the threshold
    # NOTE(review): 59 is a positional magic number — confirm which publisher it refers to
    category_values = category_values.drop(59, errors = 'ignore')

# the category name doubles as the x axis label
sns_plot = Barplot(category_values, 0, 1, selected_category, 'Number of articles')

In [None]:
# write the bar plot to a png file named after the selected category and year
barplot_file = f'barplot_{choose_category.value}_{choose_year2.value}.png'
fig = sns_plot.get_figure()
fig.savefig(barplot_file)

## Sentiment analysis

In [None]:
# create the Afinn scorer with the English word list;
# `afinn` is the module-level scorer used by my_color_func2 and for the 'AF_TXT' column
afinn = Afinn(language='en')

In [None]:
# apply the Afinn scorer to the news texts in 'TXT' and store the result in 'AF_TXT'
# na_action = 'ignore' leaves missing texts as NaN instead of passing them to the scorer;
# assign() returns a new frame, so the cell can be re-run safely and never writes
# into a slice of another frame (no SettingWithCopyWarning)
work_df = work_df.assign(AF_TXT = work_df['TXT'].map(afinn.score, na_action = 'ignore'))

In [None]:
# create a histogram to visualize the Afinn scores of the 2020 articles
# whose topics mention 'Health'
# na = False: articles without a publication date or without topics do not match
gist_df = work_df[(work_df['PD'].str.contains('2020', na = False))
                  & (work_df['NS'].str.contains('Health', na = False))]
# bin edges from -20 to 30 in steps of 5
bins = np.arange(-20, 35, 5)
n, bins, patches = plt.hist(gist_df['AF_TXT'], bins = bins, histtype = 'bar', color = 'black')
# set after plotting, so the seaborn style applies to the figures drawn afterwards
sns.set(style = "darkgrid")

In [None]:
# redraw the histogram with the y axis showing each bin's share of all counted news
# bin centre = midpoint of two neighbouring bin edges
centers = np.zeros_like(n)
centers[:] = (bins[:-1] + bins[1:]) / 2
shares = n / sum(n)
plt.bar(centers, shares, width = 4.8, color = 'black')
plt.ylim(top = 0.5)
plt.savefig('graph_Y_A_HLT_2020.png', dpi = 300, format = 'png')

In [None]:
# percentage of news falling into bins whose centre is >= 0 (positive and neutral news)
n[centers >= 0].sum() / n.sum() * 100

In [None]:
# create a new DataFrame with the Vader library scores of the 2020 articles
# whose topics mention 'Health'
analyzer = SentimentIntensityAnalyzer()
# na = False: articles without a publication date or without topics do not match
exp_df = work_df[(work_df['PD'].str.contains('2020', na = False))
                 & (work_df['NS'].str.contains('Health', na = False))]
# polarity_scores is applied to every text; the per-article results become the rows of sentiment_df
sentiment = exp_df['TXT'].apply(analyzer.polarity_scores)
sentiment_df = pd.DataFrame(sentiment.tolist())

In [None]:
# histogram of the Vader 'compound' scores, bin edges from -1 to 1 in steps of 0.25
compound_scores = sentiment_df['compound']
bins = np.arange(-1, 1.25, 0.25)
hist_result = plt.hist(compound_scores, bins = bins, histtype = 'bar', color = 'black')
n, bins, patches = hist_result

In [None]:
# redraw the histogram with the y axis showing each bin's share of all counted news
# bin centre = midpoint of two neighbouring bin edges
centers = np.zeros_like(n)
centers[:] = (bins[:-1] + bins[1:]) / 2
shares = n / sum(n)
plt.bar(centers, shares, width = 0.24, color = 'black')
plt.ylim([0, 0.5])
plt.savefig('graph_VD_Y_HLT_2020.png', dpi = 300, format = 'png')

In [None]:
# % of positive and neutral news in all news (bins whose centre is >= 0)
# multiplied by 100 so the value is a percentage, matching the Afinn figure above
sum(n[centers >= 0]) / sum(n) * 100