In [1]:
import os
import requests
import json
import re
import time
import glob
import datetime
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from os.path import join
from slugify import slugify
from bs4 import BeautifulSoup
from bs4.element import NavigableString
from urllib.parse import urlparse, parse_qs

# Working directories: tweet dumps are read from tweets_dir, which lives inside base_dir.
base_dir = "UofGFood"
tweets_dir = join(base_dir, "json-tweets")

# exist_ok=True keeps the cell re-runnable without a separate (racy) existence check.
for directory in (base_dir, tweets_dir):
    os.makedirs(directory, exist_ok=True)

In [2]:
def date_ms(timestamp):
    """Convert a POSIX timestamp into a naive ``datetime`` in local time.

    Despite the ``_ms`` suffix, the value is passed to ``fromtimestamp``
    unchanged, so it is interpreted as seconds.
    """
    local_moment = datetime.datetime.fromtimestamp(timestamp)
    return local_moment

def _expanded_links(raw_html):
    """Return the ``data-expanded-url`` value of every anchor in ``raw_html`` that has one."""
    soup = BeautifulSoup(raw_html, "lxml")
    urls = (anchor.get('data-expanded-url') for anchor in soup.find_all('a'))
    return [url for url in urls if url is not None]


# Collect every tweet that carries at least one expanded link, oldest first.
tweets_with_links = []
# sorted() pins the file order, so tweets sharing a timestamp keep a reproducible
# relative order after the (stable) sort below.
for file in sorted(glob.glob(join(tweets_dir, "*.json"))):
    # Tweet HTML may contain non-ASCII text; do not depend on the platform default encoding.
    with open(file, "r", encoding="utf-8") as f:
        tweets = json.load(f)
    for t in tweets:
        links = _expanded_links(t['rawHtml'])
        if links:
            tweets_with_links.append({
                'tweetId': t['tweetId'],
                'rawHtml': t['rawHtml'],
                'timestamp': date_ms(int(t['timestamp'])),
                'links': links,
            })
tweets_with_links.sort(key=lambda item: item["timestamp"])
print("Tweets with potential links", len(tweets_with_links))

Tweets with potential links 838


In [3]:
BATCH_SIZE = 100       # tweets scraped per output file
# NOTE(review): the recorded run started at batch 3 - presumably menu-0..2 had already
# been written by an earlier execution. Set to 0 to re-scrape everything from scratch.
START_BATCH = 3
REQUEST_TIMEOUT = 30   # seconds; without a timeout a stalled server blocks the cell forever


def _stripped_text(tag):
    """Return ``tag.text`` without surrounding whitespace, or None when the tag is missing."""
    return tag.text.strip() if tag is not None else None


def _fetch_menu_entry(tweet, link):
    """Download ``link`` and build a menu record if the page is a 'Food' article.

    Parameters:
        tweet: item of ``tweets_with_links``; ``tweetId`` and a datetime
            ``timestamp`` are read from it.
        link: URL taken from the tweet.

    Returns:
        A dict with tweetId, tweet_date, title, date, author, entries and text,
        or None when the page cannot be fetched, its category is not 'Food',
        or it has no ``div.articleText``. Missing title/author/date elements
        are recorded as None instead of aborting the whole scrape.
    """
    try:
        page = requests.get(link, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        # Best effort: tweets contain pseudo-links that do not resolve.
        print("Err link", link)
        return None

    soup = BeautifulSoup(page.text, "lxml")
    category = soup.find('p', {'class': 'categoryTrail'})
    if category is None or category.text != 'Food':
        return None
    article = soup.find('div', {'class': 'articleText'})
    if article is None:
        return None

    author_date = soup.find('p', {'class': 'authorDate'})
    has_byline = author_date is not None
    headings = [h3.text.strip() for h3 in article.find_all('h3')]
    return {
        'tweetId': tweet["tweetId"],
        'tweet_date': tweet["timestamp"].strftime('%Y-%m-%d %H:%M:%S'),
        'title': _stripped_text(soup.find('h1')),
        'date': _stripped_text(author_date.find('span')) if has_byline else None,
        'author': _stripped_text(author_date.find('a')) if has_byline else None,
        'entries': headings,
        # The full article text is kept only when the page has no <h3> headings.
        'text': None if headings else article.text,
    }


# Split the tweets into consecutive batches; every tweet is covered regardless of
# how many there are.
t_l = [
    tweets_with_links[start:start + BATCH_SIZE]
    for start in range(0, len(tweets_with_links), BATCH_SIZE)
]
for i in range(START_BATCH, len(t_l)):
    menu = []
    for t in t_l[i]:
        for l in t["links"]:
            menu_entry = _fetch_menu_entry(t, l)
            if menu_entry is not None:
                menu.append(menu_entry)
    # One JSON file per batch, so an interrupted run can be resumed via START_BATCH.
    with open(join(base_dir, "menu-" + str(i) + ".json"), "w") as ww:
        json.dump(menu, ww, indent=4)
    print("Menu", i, len(menu))

Menu 3 66
Err link http://weeks.Feedback
Menu 4 40
Menu 5 44
Menu 6 51
Err link http://Square.Coffee
Menu 7 51
Menu 8 12


In [4]:
# Sanity check: count the tweets in each fixed 100-tweet window of
# tweets_with_links, plus one final 1000-1200 window.
window_bounds = [(lo, lo + 100) for lo in range(0, 1000, 100)] + [(1000, 1200)]
t_l = [tweets_with_links[lo:hi] for lo, hi in window_bounds]
for ll in t_l:
    print("Menu", len(ll))

Menu 100
Menu 100
Menu 100
Menu 100
Menu 100
Menu 100
Menu 100
Menu 100
Menu 38
Menu 0
Menu 0


In [6]:
# Matches date headings such as "Friday 1st December 2017".
# Raw string: "\w" / "\s" are invalid escapes in a normal string literal.
regex_date = re.compile(r'\w+\s[0-9]+[a-z]{0,2}\s\w+\s[0-9]{4}')


def _collapse_nbsp(text):
    """Split ``text`` on non-breaking spaces, trim each fragment and re-join with single spaces."""
    fragments = (fragment.strip() for fragment in text.split('\u00a0'))
    # Filter after stripping so whitespace-only fragments cannot leave double spaces behind.
    return ' '.join(fragment for fragment in fragments if fragment)


def process_menu(menu):
    """Normalise the scraped strings of one menu record, in place.

    * Every string in ``menu["entries"]`` has its non-breaking spaces collapsed.
    * An entry that contains a date (see ``regex_date``) is removed from the
      list and stored under ``menu["entry_date"]``; if several match, the last
      one wins.
    * ``menu["text"]``, when present and not None, is collapsed the same way.

    Entries that are not strings are kept untouched, so the function can be
    applied again to a record whose entries have already been structured.

    Parameters:
        menu: dict with an ``entries`` list and optionally a ``text`` string.

    Returns:
        None; ``menu`` is modified in place.
    """
    clean = []
    for entry in menu["entries"]:
        if not isinstance(entry, str):
            clean.append(entry)
            continue
        cl = _collapse_nbsp(entry)
        if regex_date.search(cl):
            menu["entry_date"] = cl
        else:
            clean.append(cl)

    if menu.get("text") is not None:
        menu["text"] = _collapse_nbsp(menu["text"])

    menu["entries"] = clean
    
# Run process_menu over every JSON file in UofGFood and write the result back
# to the same path.
menu_paths = glob.glob("UofGFood/*.json")
for file in menu_paths:
    with open(file) as src:
        menus = json.load(src)

    for menu in menus:
        process_menu(menu)

    with open(file, "w") as dst:
        json.dump(menus, dst, indent=4)

In [7]:
# "<dish text> <CODE>": a trailing all-caps token after the dish name (e.g. "... soup VG").
info_regex = re.compile(r"([a-zA-Z0-9\s'&,]+)\s([A-Z]+)$")
# Legend fragments such as "VG = Vegan"; only the first word after '=' is captured.
key_regex = re.compile(r'([A-Z]+\s=\s\w+)')


def process_entries(menu):
    """Turn the plain-text entries of ``menu`` into structured records, in place.

    Each non-empty string entry becomes one of:

    * ``{"dish": ..., "info": ...}`` when it ends in an all-caps token
      (``info_regex``);
    * a contribution to ``menu["key"]`` (code -> first word of its meaning)
      when it contains ``CODE = word`` fragments (``key_regex``); such entries
      are dropped from the list and several legend lines are merged;
    * ``{"dish": entry}`` otherwise.

    Entries that are already dicts are kept unchanged, so applying the function
    twice to the same record is harmless.

    Parameters:
        menu: dict with an ``entries`` list.

    Returns:
        None; ``menu`` is modified in place.
    """
    new_entries = []
    for entry in menu["entries"]:
        if isinstance(entry, dict):
            new_entries.append(entry)
            continue
        if entry == '':
            continue

        info_search = info_regex.search(entry)
        if info_search:
            new_entries.append({
                # Slice the original entry rather than using group(1): the character
                # class stops at '-', brackets, accented letters, ... so group(1) can
                # start mid-string and drop the beginning of the dish name.
                "dish": entry[:info_search.start(2)].strip(),
                "info": info_search.group(2).strip(),
            })
            continue

        key_search = key_regex.findall(entry)
        if key_search:
            legend = menu.setdefault("key", {})
            for key in key_search:
                code, meaning = key.split('=')
                legend[code.strip()] = meaning.strip()
            continue

        new_entries.append({"dish": entry})
    menu["entries"] = new_entries
        

# Run process_entries over every JSON file in UofGFood and write the result
# back to the same path.
menu_paths = glob.glob("UofGFood/*.json")
for file in menu_paths:
    with open(file) as src:
        menus = json.load(src)

    for menu in menus:
        process_entries(menu)

    with open(file, "w") as dst:
        json.dump(menus, dst, indent=4)

In [8]:
# Preview: pretty-print the first menu file that glob returns (if there is one).
for file in glob.glob("UofGFood/*.json")[:1]:
    with open(file) as preview:
        menus = json.load(preview)
    print(json.dumps(menus, indent=4))

[
    {
        "tweetId": "898466420069191682",
        "tweet_date": "2017-08-18 09:47:41",
        "title": "Today's menu in Food for Thought, Fraser Building",
        "date": "01 Dec 2017",
        "author": "PBrown",
        "entries": [
            {
                "dish": "Yellow split pea & chive soup",
                "info": "VG"
            },
            {
                "dish": "Breaded haddock with chips and lemon wedge"
            },
            {
                "dish": "Breast of chicken korma with rice and naan bread",
                "info": "H"
            },
            {
                "dish": "Omelette with spinach, peppers & goats cheese",
                "info": "V"
            },
            {
                "dish": "Baked potato with smoked pork sausage and BBQ beans topped with cheese"
            },
            {
                "dish": "Wok station - Chinese beef and orange, stir fry veg and rice"
            }
        ],
        "text": null,
      