# Newsletter Generator

## Imports

In [1]:
# basic imports
import newspaper 
from newspaper import Article
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# nltk imports
import nltk
from nltk.tokenize import word_tokenize
from nltk.text import Text
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /Users/Zac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Zac/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# image imports
from PIL import Image
import requests
from io import BytesIO

In [4]:
# stopping the working from slice of dataframe from coming up
pd.options.mode.chained_assignment = None

In [5]:
# summariser imports
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

In [6]:
# export imports
import os
import shutil
from datetime import datetime

In [7]:
# weather import
from pyowm.owm import OWM

In [8]:
# stock inports
import pandas_datareader as pdr
from datetime import datetime, timedelta

## Getting Articles

websites = ['https://www.theverge.com', 'https://medium.com/tag/technology', 'https://towardsdatascience.com', 'https://python.plainenglish.io/']

In [9]:
websites = ['https://medium.com/tag/technology', 'https://towardsdatascience.com', 'https://python.plainenglish.io/', 'https://www.kdnuggets.com', 'https://www.dataversity.net', 'https://www.ibm.com/blogs/journey-to-ai/', 'https://insidebigdata.com', 'https://www.datarobot.com/blog/', 'https://dataaspirant.com']

In [10]:
# function to pull articles from websites:

def websites_pull(websites, no_articles):

    website = []
    title = []
    tags = []
    body = []
    authors = []
    top_image = []
    keywords = []
        
    for url in websites:

        paper = newspaper.build(url, memoize_articles=False)
        
        paper_articles = []

        print(url)

        for article in paper.articles[1:30]:
                if '#comments' not in article.url:
                    paper_articles.append(article.url)

        if len(paper_articles) > no_articles:

            for i in tqdm(range(no_articles)):

                try:

                    article = Article(paper_articles[i], language="en")

                    article.download()
                    article.parse()

                    website.append(article.url)

                    title.append(article.title)

                    body.append(article.text)

                    authors.append(article.authors)

                    top_image.append(article.top_image)

                    article.nlp()

                    keywords.append(article.keywords)

                except:
                    continue
        
        else:
            for i in tqdm(range(len(paper_articles))):

                    try:

                        article = Article(paper_articles[i], language="en")

                        article.download()
                        article.parse()

                        website.append(article.url)

                        title.append(article.title)

                        body.append(article.text)

                        authors.append(article.authors)

                        top_image.append(article.top_image)

                        article.nlp()

                        keywords.append(article.keywords) 
                    
                    except:
                        continue

    df = pd.DataFrame({'title':title, 'authors':authors, 'body':body, 'website':website,
    'image':top_image, 'keywords':keywords})

    return df

In [11]:
# put in a list of websites and the number of articles desired
# returns a dataframe of articles

df = websites_pull(websites, 20)

https://medium.com/tag/technology


100%|██████████| 20/20 [00:41<00:00,  2.06s/it]


https://towardsdatascience.com


100%|██████████| 20/20 [00:41<00:00,  2.08s/it]


https://python.plainenglish.io/


100%|██████████| 20/20 [00:46<00:00,  2.35s/it]


https://www.kdnuggets.com


100%|██████████| 20/20 [00:11<00:00,  1.76it/s]


https://www.dataversity.net


100%|██████████| 15/15 [00:16<00:00,  1.07s/it]


https://www.ibm.com/blogs/journey-to-ai/


100%|██████████| 16/16 [00:19<00:00,  1.24s/it]


https://insidebigdata.com


100%|██████████| 20/20 [00:13<00:00,  1.44it/s]


https://www.datarobot.com/blog/


100%|██████████| 13/13 [00:14<00:00,  1.13s/it]


https://dataaspirant.com


100%|██████████| 3/3 [00:05<00:00,  1.86s/it]


In [12]:
# to be able to reset data frame without running above code again

'''df = pd.DataFrame({'title':title, 'authors':authors, 'body':body, 'website':website,
 'image':top_image, 'keywords':keywords})'''

"df = pd.DataFrame({'title':title, 'authors':authors, 'body':body, 'website':website,\n 'image':top_image, 'keywords':keywords})"

## Cleaning the data:

In [13]:
# getting rid of duplicate articles
df['title'] = df['title'].drop_duplicates()
df = df.dropna()
df.reset_index(drop=True, inplace=True)

In [14]:
# cleaning the main text - removing \n
df['body'] = df['body'].apply(lambda x:x.replace('\n', ' '))

In [15]:
# removing double spaces
df['body'] = df['body'].apply(lambda x:' '.join(x.split()))

In [16]:
# cleaning urls
df['website']=df['website'].apply(lambda x:x.split('?')[0])

In [17]:
# dropping short articles - sometimes not all the article is scraped
df.drop(df[df['body'].str.len() < 1000].index, inplace=True)
df.reset_index(drop=True, inplace=True)

In [18]:
'''# drop articles which start with a number (summariser can't cope with e.g. 7 steps to...) 
# if number is not 20 - just to keep 2021 and 2022 articles in the mix
df.drop(df[(df['title'].str[0].str.isdigit()) & (df['title'].str[0:2] != '20')].index, inplace=True)
df.reset_index(drop=True, inplace=True)'''

"# drop articles which start with a number (summariser can't cope with e.g. 7 steps to...) \n# if number is not 20 - just to keep 2021 and 2022 articles in the mix\ndf.drop(df[(df['title'].str[0].str.isdigit()) & (df['title'].str[0:2] != '20')].index, inplace=True)\ndf.reset_index(drop=True, inplace=True)"

## Getting Collocations

In [19]:
## bringing back most frequent word pairs found in each article

col_final = []

for i in range(len(df['body'])):

    textfile = df['body'][i]
            
    tokens = word_tokenize(textfile) # getting tokens

    lower_tokens = [token.lower() for token in tokens] # making everying lower case

    clean1 = [word for word in lower_tokens if word.isalpha()] # getting rid of numbers

    stop_words = stopwords.words('english') # getting stop words
    word_m_stop = [word for word in clean1 if not word in stop_words]
    final_text = Text(word_m_stop)

    col_list = final_text.collocation_list()

    col_list = [list(i) for i in col_list]

    col_med = []

    for i in range(len(col_list)):
        sentence = col_list[i]
        col_med.append(' '.join(sentence))
        
    col_final.append(col_med)
    
df['collocations'] = col_final

## Filtering for Relevancy

In [20]:
filter_words_upper = ['Artificial Intelligence',
 'Big Data', ''
 'Clustering',
 'Python',
 'Outlier',
 'Data Science',
 'Data Warehouse',
 'Machine Learning',
 'Artificial Intelligence',
 'Data Analysis',
 'Data Engineering',
 'Data Visualization',
 'Data Wrangling',
 'Box Plot',
 'Correlation',
 'dashboard',
 'EDA',
 'Histogram',
 'Hypothesis',
 'Iteration',
 'AWS',
 'azure',
 'Numpy',
 'pandas',
 'matplotlib',
 'seaborn',
 'Bayes Theorem',
 'Decision Tree',
 'Quantile',
 'Predictive Modelling',
 'Standard Deviation',
 'Random Forest',
 'boolean',
 'Fuzzy Logic',
 'Regression',
 'Classification',
 'Overfit',
 'underfit',
 'Statistical Significance',
 'Variance',
 'Deep Learning',
 'Feature Selection',
 'Supervised Machine Learning',
 'Unsupervised Machine Learning',
 'Binary Variable',
 'Binomial Distribution',
 'Computer Vision',
 'Confusion Matrix',
 'covariance',
 'Degrees of Freedom',
 'Evaluation Metrics',
 'F-Score',
 'Hadoop',
 'Hyperparameter',
 'IQR',
 'Keras',
 'kNN',
 'NoSQL',
 'Normal Distribution',
 'Normalize',
 'One Hot Encoding',
 'dummies',
 'recall',
 'P-Value',
 'roc',
 'auc',
 'Root Mean Squared Error',
 'rmse',
 'Skewness',
 'SMOTE',
 'stadardize',
 'Standard error',
 'TensorFlow',
 'Univariate Analysis',
 'Z-test',
 'Residual',
 'Neural Network',
 'Autoregression',
 'Backpropogation',
 'Bagging',
 'Bias-Variance Trade-off',
 'Boosting',
 'Bootstrapping',
 'Classification Threshold',
 'Convex Function',
 'Cosine Similarity',
 'Cost Function',
 'Cross Entropy',
 'Cross Validation',
 'DBScan',
 'Decision Boundary',
 'Dplyr',
 'Early Stopping',
 'Feature Hashing',
 'Gated Recurrent Unit',
 'Hidden Markov Model',
 'Hierarchical Clustering',
 'Holdout Sample',
 'Holt-Winters Forecasting',
 'Imputation',
 'K-Means',
 'Kurtosis',
 'Lasso',
 'Maximum Likelihood Estimation',
 'Multivariate Analysis',
 'Naive Bayes',
 'Polynomial Regression',
 'Ridge Regression',
 'Rotational Invariance',
 ' Semi-Supervised Learning',
 'Stochastic Gradient Descent',
 'SVM']

In [21]:
# making filtered words lower
filter_words = [word.lower() for word in filter_words_upper]

In [22]:
# creating masks - getting boolean arrays for where keywords, collocations or title contain keywords
mask = np.array([bool(set(map(str, x)) & set(filter_words)) for x in df['collocations']])
mask2 = np.array([bool(set(map(str, x)) & set(filter_words)) for x in df['keywords']])
mask3 = np.array([bool(set(map(str, x)) & set(filter_words)) for x in df['title']])

In [23]:
# filtering by mask and resetting index
df_filtered = df[mask | mask2 | mask3]
df_filtered.reset_index(drop=True, inplace=True)

In [24]:
# deal with towards data heading !!!
# deal with first line issue with body. Does it matter?? !!!

## Tagging articles - beginner, intermediate, advanced

In [25]:
## getting lower_case keywords

# keywords
beginner_upper = ['Artificial Intelligence', 'Big Data', 'Python', 'Outlier', 'Data Science', 'Data Warehouse', 'Machine Learning', 'Artificial Intelligence', 'Data Analysis', 'Data Engineering', 'Data Visualization', 'Data Wrangling', 'Box Plot', 'Correlation', 'dashboard', 'EDA', 'Histogram', 'Hypothesis', 'Iteration', 'AWS', 'azure', 'Numpy', 'pandas', 'matplotlib', 'seaborn']

beginner = []

for word in beginner_upper:
    word = word.lower()
    beginner.append(word)

medium_upper = ['Clustering','Bayes Theorem', 'Decision Tree', 'Quantile', 'Predictive Modelling', 'Standard Deviation', 'Random Forest', 'boolean', 'Fuzzy Logic', 'Regression', 'Classification', 'Overfit', 'underfit', 'Statistical Significance', 'Variance', 'Deep Learning', 'Feature Selection', 'Supervised Machine Learning', 'Unsupervised Machine Learning', 'Binary Variable', 'Binomial Distribution', 'Computer Vision', 'Confusion Matrix', 'covariance', 'Degrees of Freedom', 'Evaluation Metrics', 'F-Score', 'Hadoop', 'Hyperparameter', 'IQR', 'Keras', 'kNN', 'NoSQL', 'Normal Distribution', 'Normalize', 'One Hot Encoding', 'dummies', 'recall', 'P-Value', 'roc', 'auc', 'Root Mean Squared Error', 'rmse', 'Skewness', 'SMOTE', 'standardize', 'Standard error', 'TensorFlow', 'Univariate Analysis', 'Z-test']

medium = []

for word in medium_upper:
    word = word.lower()
    medium.append(word)

advanced_upper = ['Residual', 'Neural Network', 'Autoregression', 'Backpropogation', 'Bagging', 'Bias-Variance Trade-off', 'Boosting', 'Bootstrapping', 'Classification Threshold', 'Convex Function', 'Cosine Similarity', 'Cost Function', 'Cross Entropy', 'Cross Validation', 'DBScan', 'Decision Boundary', 'Dplyr', 'Early Stopping', 'Feature Hashing', 'Gated Recurrent Unit', 'Hidden Markov Model', 'Hierarchical Clustering', 'Holdout Sample', 'Holt-Winters Forecasting', 'Imputation', 'K-Means', 'Kurtosis', 'Lasso', 'Maximum Likelihood Estimation', 'Multivariate Analysis', 'Naive Bayes', 'Polynomial Regression', 'Ridge Regression', 'Rotational Invariance', ' Semi-Supervised Learning', 'Stochastic Gradient Descent', 'SVM']

advanced = []

for word in advanced_upper:
    word = word.lower()
    advanced.append(word)

In [26]:
# adding columns to dataframe

beg = []
med = []
adv = []


for i in range(len(df_filtered['body'])):

    beg_count = 0

    for word in beginner:
        if word in df_filtered['body'][i]:
            beg_count +=1

    med_count = 0

    for word in medium:
        if word in df_filtered['body'][i]:
            med_count +=1

    adv_count = 0

    for word in advanced:
        if word in df_filtered['body'][i]:
            adv_count +=1

    total_count = beg_count + med_count + adv_count

    if total_count != 0:

        beg_percentage = beg_count/total_count
        med_percentage = med_count/total_count
        adv_percentage = adv_count/total_count

        beg.append(beg_percentage)
        med.append(med_percentage)
        adv.append(adv_percentage)

    else:
        beg.append(0)
        med.append(0)
        adv.append(0)

df_filtered['percentage_beginner'] = beg
df_filtered['percentage_medium'] = med
df_filtered['percentage_advanced'] = adv


In [27]:
# getting back tag column
# adjusted advanced threshold

tags = []

for i in range(len(df_filtered)):
    if df_filtered['percentage_advanced'][i] >= 0.3:
        tags.append('Advanced')
    elif df_filtered['percentage_medium'][i] >= 0.5:
        tags.append('Intermediate')
    else:
        tags.append('Beginner')

df_filtered['tag'] = tags

In [28]:
df_filtered.head()

Unnamed: 0,title,authors,body,website,image,keywords,collocations,percentage_beginner,percentage_medium,percentage_advanced,tag
0,Eight “No-Code” Features In Python,[Christopher Tao],Eight “No-Code” Features In Python One of the ...,https://towardsdatascience.com/eight-no-code-f...,https://miro.medium.com/max/1200/1*KTo1ShsunQ4...,"[nocode, code, string, m, python, dont, able, ...","[web server, python cli, without writing, writ...",1.0,0.0,0.0,Beginner
1,Feature Selection in Scikit-learn,[Zolzaya Luvsandorj],📍 3. Feature selection We will look at five di...,https://towardsdatascience.com/feature-selecti...,https://miro.medium.com/max/1200/0*PrHExWptvaF...,"[scikitlearn, selected, roc, feature, select, ...","[data roc, roc auc, test data, training data, ...",0.142857,0.857143,0.0,Intermediate
2,A Guide to Problem-Solving in the Data Industry,[],1. Occam’s razor — don’t over-complicate thing...,https://towardsdatascience.com/a-guide-to-prob...,https://miro.medium.com/max/1200/0*p2SX4Leg-oq...,"[work, dont, best, actually, excel, problemsol...","[problem solving, group work, needed think, da...",0.75,0.0,0.25,Beginner
3,Sell Out Sell In Forecasting,[Bartosz Szabłowski],"Create X variables based on cyclic variables, ...",https://towardsdatascience.com/sell-out-sell-i...,https://miro.medium.com/max/1200/1*uoliQIioMkY...,"[learning, weeks, model, forecast, week, varia...","[machine learning, image author, multioutput r...",0.25,0.625,0.125,Intermediate
4,13 Advanced Python Scripts For Everyday Progra...,[Haider Imtiaz],1. SpeedTest with Python This advanced script ...,https://python.plainenglish.io/13-advanced-pyt...,https://miro.medium.com/max/1200/1*O5taq5fU3tz...,"[scripts, import, code, everyday, script, pyth...","[pip install, recycle bin, window version, hex...",0.5,0.5,0.0,Intermediate


## Getting Read_time

In [29]:

## getting read time
# according to the internet a person reads around 238 words per minute
import math

df_filtered['read_time'] = df_filtered['body'].apply(lambda x:math.ceil(len(x.split())/238))

## Choosing Articles to Suggest - currently random

In [30]:
# trying to get make sure there aren't two articles from the same webpage - works now

# finish this!!

'''if len(df_filtered) > 5:
    df_sample = df_filtered.sample(n=5)
    df_sample = df_sample['website'].apply(lambda x:x.split('/')[2]).drop_duplicates()
    
    while len(df_sample) < 5:
        df_sample = df_filtered.sample(n=5)
        df_sample = df_sample['website'].apply(lambda x:x.split('/')[2]).drop_duplicates()

df_sample'''

"if len(df_filtered) > 5:\n    df_sample = df_filtered.sample(n=5)\n    df_sample = df_sample['website'].apply(lambda x:x.split('/')[2]).drop_duplicates()\n    \n    while len(df_sample) < 5:\n        df_sample = df_filtered.sample(n=5)\n        df_sample = df_sample['website'].apply(lambda x:x.split('/')[2]).drop_duplicates()\n\ndf_sample"

In [31]:
# choosing 5 randomly

'''if len(df_filtered) > 5:
    df_filtered = df_filtered.sample(n=5)

df_filtered = df_filtered.reset_index(drop=True)'''

'if len(df_filtered) > 5:\n    df_filtered = df_filtered.sample(n=5)\n\ndf_filtered = df_filtered.reset_index(drop=True)'

## Getting Summaries - BART Summariser

In [32]:
# getting model

model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

In [33]:
def get_summaries(df_filtered, length):

    summaries = []

    for i in range(len(df_filtered['body'])):
        document = df_filtered['body'][i]

        inputs = tokenizer([document], max_length=1024, return_tensors='pt', truncation=True)

        summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=length, early_stopping=True, length_penalty=2)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)

        summaries.append(summary)
    
    df_filtered['summary'] = summaries

    return df_filtered

In [34]:
# add dataframe and max length of summary

df_filtered = get_summaries(df_filtered, 120)

### cleaning summaries

In [35]:
# removing last sentence from summary if not complete

sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

summaries_complete = []

for i in range(len(df_filtered['summary'])):
    string = df_filtered['summary'][i]

    sentences = sent_tokenizer.tokenize(string)

    add = []

    for i in range(len(sentences)):
        if sentences[i][-1] == '.':
            add.append(sentences[i])


    summaries_complete.append(" ".join(s for s in add))

df_filtered['summary'] = summaries_complete

In [36]:
# removing sentences which are the same as the title

sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

summaries_no_title = []

for i in range(len(df_filtered['summary'])):
    string = df_filtered['summary'][i]

    sentences = sent_tokenizer.tokenize(string)

    add = []

    for i in range(len(sentences)):
        if sentences[i][:-1] != df_filtered['title'][i]:
            add.append(sentences[i])

    summaries_no_title.append(" ".join(s for s in add))

df_filtered['summary'] = summaries_no_title

In [37]:
# removing any white space from start and end of summaries
df_filtered['summary'] = df_filtered['summary'].apply(lambda x:x.strip())

In [38]:
# remove summaries that contain code - it is usually incomplete and not helpful

## Newspaper Function - exports articles with relevant information and get's back weather information (and stocks)

In [51]:
def newsletter():

    # make a folder to put contents

    path = '/Users/Zac/Desktop/Newsletter - ' + datetime.strftime(datetime.now(), '%d-%m-%Y') + '/'

    if os.path.exists(path):
        shutil.rmtree(path)

    os.mkdir(path)

    with open(os.path.join('/Users/Zac/Desktop/Newsletter - ' + datetime.strftime(datetime.now(), '%d-%m-%Y') + '/','Summaries.txt'), "w") as f:
        f.write(datetime.strftime(datetime.now(), '%A %d %B') + '\n\n')

    for i in range(len(df_filtered)):

        # specifify text file

        with open('/Users/Zac/Desktop/Newsletter - ' + datetime.strftime(datetime.now(), '%d-%m-%Y') + '/Summaries.txt', 'a') as f:
            f.write('Tag: ' + df_filtered['tag'][i].upper() + ' - Read Time: ' + str(df_filtered['read_time'][i]) + '\n\n' + df_filtered['title'][i] + '\n\n' + df_filtered['summary'][i] + '\n\n' + 'Read the full story here: ' + df_filtered['website'][i] + '\n\n' + '---------------' + '\n\n')

        # images

        try:
            response = requests.get(df_filtered['image'][i])
            img = Image.open(BytesIO(response.content))
            img.save('/Users/Zac/Desktop/Newsletter - ' + datetime.strftime(datetime.now(), '%d-%m-%Y') + '/' + df_filtered['title'][i] +'.png')

        except:
            try:
                img.save('/Users/Zac/Desktop/Newsletter - ' + datetime.strftime(datetime.now(), '%d-%m-%Y') + '/' + df_filtered['title'][i] +'.jpg')
            except:
                print('Image error on', df_filtered['title'][i])
        continue


    # weather

    owm = OWM('b33c2d9566ba8b3c3260afc40c91d012')
    mgr = owm.weather_manager()
    observation = mgr.one_call(lat = 41.38879, lon = 2.15899)
    weather = observation.forecast_daily[0]

    # saving weather to disc

    path = '/Users/Zac/Desktop/Newsletter - ' + datetime.strftime(datetime.now(), '%d-%m-%Y') + '/Weather/'
    
    # make a folder to put contents

    if os.path.exists(path):
        shutil.rmtree(path)

    os.mkdir(path)

    with open(os.path.join('/Users/Zac/Desktop/Newsletter - ' + datetime.strftime(datetime.now(), '%d-%m-%Y') + '/Weather/' + 'weather.txt'), "w") as f:
        
        f.write('Barcelona'.upper()+ '\n\n')

        weather_dict = {'clear sky':'Clear Skies', 'few clouds':'Partly Cloudy', 'scattered clouds':'Partly Cloudy', 'broken clouds':'Cloudy', 'shower rain':'Light Showers', 'rain':'Rain', 'thunderstorm':'Stormy', 'snow':'Snow', 'mist':'Misty'}

        f.write(weather_dict[weather.detailed_status].upper() + '\n\n')
    
        # high and low temp
        f.write('Low/High: ' + str(round(weather.temperature('celsius')['min'])) + '°/' + str(round(weather.temperature('celsius')['max']))+ '°\n\n')

        # sunrise
        timestamp = datetime.fromtimestamp(weather.srise_time)
        f.write('Sunrise: ' + timestamp.strftime('%H:%M') + '\n')

        # sunset
        timestamp = datetime.fromtimestamp(weather.sset_time)
        f.write('Sunset: '+ timestamp.strftime('%H:%M'))

    # getting back weather icon

    try:

        url = weather.weather_icon_url()

        response = requests.get(url)
        img = Image.open(BytesIO(response.content))

        img.save('/Users/Zac/Desktop/Newsletter - ' + datetime.strftime(datetime.now(), '%d-%m-%Y') + '/Weather/' + 'weathericon' + '.png')

    except:
        print('image error')

    # stocks

    stock_weekdays = ['Tue', 'Wed', 'Thu', 'Fri']

    stock_dict = {'^GSPC':'S&P 500', '^IBEX':'IBEX 35', '^IXIC':'NASDAQ', '^DJI':'DOW'}

    path = '/Users/Zac/Desktop/Newsletter - ' + datetime.strftime(datetime.now(), '%d-%m-%Y') + '/Stocks/'
    
    # make a folder to put contents

    if os.path.exists(path):
        shutil.rmtree(path)

    os.mkdir(path)

    with open(os.path.join('/Users/Zac/Desktop/Newsletter - ' + datetime.strftime(datetime.now(), '%d-%m-%Y') + '/Stocks/' + 'stocks.txt'), "w") as f:
        f.write('Stocks: \n\n')

    # get daily stock change for weekdays

    if datetime.strftime(datetime.now() - timedelta(1), '%a') in stock_weekdays:

        for stock_ticker in stock_dict.keys():

            df = pdr.get_data_yahoo(stock_ticker)
            df = df.reset_index()
            yesterday_close = df[df['Date'] == datetime.strftime(datetime.now() - timedelta(1), '%Y-%m-%d')]['Close'].item()

            try:
                day_before_yesterday_close = df[df['Date'] == datetime.strftime(datetime.now() - timedelta(2), '%Y-%m-%d')]['Close'].item()
            except:
                print('No stock data available today.')

            change = yesterday_close - day_before_yesterday_close
            percentage_change = round((change/day_before_yesterday_close)*100, 2)

            with open(os.path.join('/Users/Zac/Desktop/Newsletter - ' + datetime.strftime(datetime.now(), '%d-%m-%Y') + '/Stocks/' + 'stocks.txt'), "a") as f:

                if percentage_change > 0:
                
                    f.write(stock_dict[stock_ticker] + ' +' + str(percentage_change) +  '%\n\n')
                
                else:
                    f.write(stock_dict[stock_ticker]+ ' ' +str(percentage_change) + '%\n\n')

    # if not a weekday, get yearly change

    else:
        for stock_ticker in stock_dict.keys():

            df = pdr.get_data_yahoo(stock_ticker)
            df = df.reset_index()

            start_year = df[df['Date'] == datetime.strftime(datetime(datetime.today().year, 2, 1), '%Y-%m-%d')]['Close'].item()
            yesterday_close = df[df['Date'] == datetime.strftime(datetime.now() - timedelta(1), '%Y-%m-%d')]['Close'].item()

            change = yesterday_close - start_year
            percentage_change = round((change/start_year)*100, 2)

            if percentage_change > 0:
                    
                f.write('Yearly Change: ' + stock_dict[stock_ticker] + ' +' + str(percentage_change) +  '%')
                    
            else:
                f.write('Yearly Change: ' + stock_dict[stock_ticker]+ ' ' +str(percentage_change) + '%')

In [52]:
newsletter()

Image error on Optibrium Shows Deep Learning to Successfully Predict Human Panel-based Sensory Perception of Novel Compounds Used for Flavors and Fragrances
Image error on “Above the Trend Line” – Your Industry Rumor Central for 12/10/2021
Image error on Heard on the Street – 11/30/2021


# Wishlist

## Getting words over time graph

In [41]:
'''import pandas as pd
from pytrends.request import TrendReq

pytrends = TrendReq(hl='en-UK', tz=60)'''

"import pandas as pd\nfrom pytrends.request import TrendReq\n\npytrends = TrendReq(hl='en-UK', tz=60)"

In [42]:
# Use NLTK to get back top 3 words/collocations

# then plot a graph of this information at the bottom of the newsletter

# today's top words

In [43]:
#df_filtered['body']

In [44]:
weather_dict = {'clear sky':'Clear Skies', 'few clouds':'Partly Cloudy', 'scattered clouds':'Partly Cloudy', 'broken clouds':'Cloudy', 'shower rain':'Light Showers', 'rain':'Rain', 'thunderstorm':'Stormy', 'snow':'Snow', 'mist':'Misty'}


In [45]:
weather_dict['clear sky']

'Clear Skies'

In [46]:
'''list_of_bad_words = ['clear sky', 'few clouds']

for word in list_of_bad_words:
    print(weather_dict[word])

    df['lyrics'].apply(lambda x:x.replace(weather_dict[word], ))'''

"list_of_bad_words = ['clear sky', 'few clouds']\n\nfor word in list_of_bad_words:\n    print(weather_dict[word])\n\n    df['lyrics'].apply(lambda x:x.replace(weather_dict[word], ))"