## Scraping tweets from El Paso, Lubbock, Amarillo, Laredo, Odessa, and Aberline


In [1]:
import numpy as np
import pandas as pd
import GetOldTweets3 as got
import time
from calendar import Calendar, monthrange
from datetime import datetime, timedelta
import re

In [2]:
# Define a function to request tweets within a certain time period
def get_tweets(query, since, until, max_tweets,location, radius):
    # Set our tweet criteria using GetOldTweets3
    tweetCriteria = got.manager.TweetCriteria()\
                                            .setQuerySearch(query)\
                                            .setSince(since)\
                                            .setUntil(until)\
                                            .setMaxTweets(max_tweets)\
                                            .setNear(location)\
                                            .setWithin(radius)
    
    tweet = got.manager.TweetManager.getTweets(tweetCriteria)
    return tweet

In [3]:
def create_df(tweet):
    total_list = []
    for i in range(len(tweet)):
        my_dict = {
            "id" : tweet[i].id,
            "text" : tweet[i].text,
            "date" : tweet[i].date,
            "retweets" : tweet[i].retweets,
            "favorites" : tweet[i].favorites,
            "mentions" : tweet[i].mentions,
            "hashtags" : tweet[i].hashtags,
            "geo" : tweet[i].geo  
        }
        total_list.append(my_dict)
    return pd.DataFrame(total_list)

In [4]:
since_list = ["2020-08-30", "2020-08-31", "2020-09-01", "2020-09-02", "2020-09-03", "2020-09-04", "2020-09-05"]

until_list = ["2020-08-31", "2020-09-01", "2020-09-02", "2020-09-03", "2020-09-04", "2020-09-05","2020-09-06"]

In [5]:
def all_tweets(region, query, since, until,
                max_tweets, location, radius):
    t0 = time.time() 
    df = pd.DataFrame()
    for i in range(len(since)):
        x = get_tweets(query=query,
                   since=since[i],
                   until=until[i],
                   max_tweets = max_tweets,
                   location = location,
                   radius = radius
                  )
        df = pd.concat([df,create_df(x)])
        print(f"Index {i}: day {since_list[i]} complete at {t0 - time.time()} seconds")
        if i < len(since)-1:
            time.sleep(60)
    print(f"Final run time: {t0 - time.time()} seconds")
    df["region"] = region
    df.to_csv(f"{region}_data.csv")
    return df

In [6]:
el_paso = all_tweets(
                region =  "El_Paso",
                query = ' ',
                since = since_list,
                until = until_list,
                max_tweets = 2000,
                location = "31.7619, -106.4850",
                radius = "20mi")

Index 0: day 2020-08-30 complete at -50.82333016395569 seconds
Index 1: day 2020-08-31 complete at -164.36766815185547 seconds
Index 2: day 2020-09-01 complete at -279.12176632881165 seconds
Index 3: day 2020-09-02 complete at -394.385538816452 seconds
Index 4: day 2020-09-03 complete at -510.3688943386078 seconds
Index 5: day 2020-09-04 complete at -621.6554050445557 seconds
Index 6: day 2020-09-05 complete at -733.039519071579 seconds
Final run time: -733.0405173301697 seconds


In [7]:
el_paso_covid = el_paso['text'].str.contains('covid|coronavirus', flags=re.IGNORECASE, regex=True).sum()
el_paso_covid

#s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True)
#mel_count=a['Names'].str.contains('Mel').sum()
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.contains.html#pandas.Series.str.contains

71

In [9]:
lubbock = all_tweets(
                region =  "Lubbock",
                query = ' ',
                since = since_list,
                until = until_list,
                max_tweets = 2000,
                location = "33.5779, -101.8552",
                radius = "12mi")

Index 0: day 2020-08-30 complete at -19.78313636779785 seconds
Index 1: day 2020-08-31 complete at -99.768381357193 seconds
Index 2: day 2020-09-01 complete at -180.91333079338074 seconds
Index 3: day 2020-09-02 complete at -262.95881700515747 seconds
Index 4: day 2020-09-03 complete at -343.6849386692047 seconds
Index 5: day 2020-09-04 complete at -456.7159893512726 seconds
Index 6: day 2020-09-05 complete at -569.8150525093079 seconds
Final run time: -569.8150525093079 seconds


In [10]:
lubbock_covid= lubbock['text'].str.contains('covid|coronavirus', flags=re.IGNORECASE, regex=True).sum()
lubbock_covid

107

In [11]:
amarillo = all_tweets(
                region =  "Amarillo",
                query = ' ',
                since = since_list,
                until = until_list,
                max_tweets = 2000,
                location = "35.2220, -101.8313",
                radius = "13mi")

Index 0: day 2020-08-30 complete at -5.616105318069458 seconds


KeyboardInterrupt: 

In [None]:
amarillo_covid = amarillo['text'].str.contains('covid|coronavirus', flags=re.IGNORECASE, regex=True).sum()
amarillo_covid

In [None]:
laredo = all_tweets(
                region =  "Laredo",
                query = ' ',
                since = since_list,
                until = until_list,
                max_tweets = 2000,
                location = "27.5199841,-99.4953764",
                radius = "9mi")

In [None]:
laredo_covid = laredo['text'].str.contains('covid|coronavirus', flags=re.IGNORECASE, regex=True).sum()
laredo_covid

In [None]:
odessa = all_tweets(
                region =  "Odessa",
                query = ' ',
                since = since_list,
                until = until_list,
                max_tweets = 2000,
                location = "31.8457, -102.3676",
                radius = "14mi")

In [None]:
odessa_covid = odessa['text'].str.contains('covid|coronavirus', flags=re.IGNORECASE, regex=True).sum()
odessa_covid

In [None]:
abilene = all_tweets(
                region =  "Abilene",
                query = ' ',
                since = since_list,
                until = until_list,
                max_tweets = 2000,
                location = "32.4487, -99.7331",
                radius = "9mi")

In [None]:
abilene_covid = abilene['text'].str.contains('covid|coronavirus', flags=re.IGNORECASE, regex=True).sum()
abilene_covid