## Scraping tweets from El Paso, Lubbock, Amarillo, Laredo, Odessa, and Aberline


In [12]:
import numpy as np
import pandas as pd
import GetOldTweets3 as got
import time
from calendar import Calendar, monthrange
from datetime import datetime, timedelta
import re

In [13]:
# Define a function to request tweets within a certain time period
def get_tweets(query, since, until, max_tweets,location, radius):
    # Set our tweet criteria using GetOldTweets3
    tweetCriteria = got.manager.TweetCriteria()\
                                            .setQuerySearch(query)\
                                            .setSince(since)\
                                            .setUntil(until)\
                                            .setMaxTweets(max_tweets)\
                                            .setNear(location)\
                                            .setWithin(radius)
    
    tweet = got.manager.TweetManager.getTweets(tweetCriteria)
    return tweet

In [14]:
def create_df(tweet):
    total_list = []
    for i in range(len(tweet)):
        my_dict = {
            "id" : tweet[i].id,
            "text" : tweet[i].text,
            "date" : tweet[i].date,
            "retweets" : tweet[i].retweets,
            "favorites" : tweet[i].favorites,
            "mentions" : tweet[i].mentions,
            "hashtags" : tweet[i].hashtags,
            "geo" : tweet[i].geo  
        }
        total_list.append(my_dict)
    return pd.DataFrame(total_list)

In [15]:
since_list = ["2020-08-30", "2020-08-31", "2020-09-01", "2020-09-02", "2020-09-03", "2020-09-04", "2020-09-05"]

until_list = ["2020-08-31", "2020-09-01", "2020-09-02", "2020-09-03", "2020-09-04", "2020-09-05","2020-09-06"]

In [16]:
def all_tweets(region, query, since, until,
                max_tweets, location, radius):
    t0 = time.time() 
    df = pd.DataFrame()
    for i in range(len(since)):
        x = get_tweets(query=query,
                   since=since[i],
                   until=until[i],
                   max_tweets = max_tweets,
                   location = location,
                   radius = radius
                  )
        df = pd.concat([df,create_df(x)])
        print(f"Index {i}: day {since_list[i]} complete at {t0 - time.time()} seconds")
        if i < len(since)-1:
            time.sleep(60)
    print(f"Final run time: {t0 - time.time()} seconds")
    df["region"] = region
    df.to_csv(f"{region}_data.csv")
    return df

In [17]:
el_paso = all_tweets(
                region =  "El_Paso",
                query = ' ',
                since = since_list,
                until = until_list,
                max_tweets = 2000,
                location = "31.7619, -106.4850",
                radius = "20mi")

Index 0: day 2020-08-30 complete at -55.494654417037964 seconds
Index 1: day 2020-08-31 complete at -175.6429831981659 seconds
Index 2: day 2020-09-01 complete at -293.6534070968628 seconds
Index 3: day 2020-09-02 complete at -413.12114930152893 seconds
Index 4: day 2020-09-03 complete at -532.5497512817383 seconds
Index 5: day 2020-09-04 complete at -652.437087059021 seconds
Index 6: day 2020-09-05 complete at -769.1250152587891 seconds
Final run time: -769.1250152587891 seconds


In [18]:
el_paso_covid = el_paso['text'].str.contains('covid|coronavirus', flags=re.IGNORECASE, regex=True).sum()
el_paso_covid

#s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True)
#mel_count=a['Names'].str.contains('Mel').sum()
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.contains.html#pandas.Series.str.contains

60

In [20]:
lubbock = all_tweets(
                region =  "Lubbock",
                query = ' ',
                since = since_list,
                until = until_list,
                max_tweets = 2000,
                location = "33.5779, -101.8552",
                radius = "12mi")

Index 0: day 2020-08-30 complete at -21.305082321166992 seconds
Index 1: day 2020-08-31 complete at -105.15097951889038 seconds
Index 2: day 2020-09-01 complete at -188.64397144317627 seconds
Index 3: day 2020-09-02 complete at -272.37101793289185 seconds
Index 4: day 2020-09-03 complete at -355.7938964366913 seconds
Index 5: day 2020-09-04 complete at -440.66676688194275 seconds
Index 6: day 2020-09-05 complete at -557.369528055191 seconds
Final run time: -557.369528055191 seconds


In [21]:
lubbock_covid= lubbock['text'].str.contains('covid|coronavirus', flags=re.IGNORECASE, regex=True).sum()
lubbock_covid

87

In [22]:
amarillo = all_tweets(
                region =  "Amarillo",
                query = ' ',
                since = since_list,
                until = until_list,
                max_tweets = 2000,
                location = "35.2220, -101.8313",
                radius = "13mi")

Index 0: day 2020-08-30 complete at -5.924284219741821 seconds
Index 1: day 2020-08-31 complete at -73.04520177841187 seconds
Index 2: day 2020-09-01 complete at -141.26595616340637 seconds
Index 3: day 2020-09-02 complete at -209.47456884384155 seconds
Index 4: day 2020-09-03 complete at -279.32396626472473 seconds
Index 5: day 2020-09-04 complete at -348.95183300971985 seconds
Index 6: day 2020-09-05 complete at -462.9198319911957 seconds
Final run time: -462.9198319911957 seconds


In [23]:
amarillo_covid = amarillo['text'].str.contains('covid|coronavirus', flags=re.IGNORECASE, regex=True).sum()
amarillo_covid

30

In [24]:
laredo = all_tweets(
                region =  "Laredo",
                query = ' ',
                since = since_list,
                until = until_list,
                max_tweets = 2000,
                location = "27.5199841,-99.4953764",
                radius = "9mi")

Index 0: day 2020-08-30 complete at -9.5302414894104 seconds
Index 1: day 2020-08-31 complete at -80.47281980514526 seconds
Index 2: day 2020-09-01 complete at -153.67202138900757 seconds
Index 3: day 2020-09-02 complete at -224.20110297203064 seconds
Index 4: day 2020-09-03 complete at -296.73204231262207 seconds
Index 5: day 2020-09-04 complete at -369.489529132843 seconds
Index 6: day 2020-09-05 complete at -484.9079625606537 seconds
Final run time: -484.9089596271515 seconds


In [25]:
laredo_covid = laredo['text'].str.contains('covid|coronavirus', flags=re.IGNORECASE, regex=True).sum()
laredo_covid

28

In [26]:
odessa = all_tweets(
                region =  "Odessa",
                query = ' ',
                since = since_list,
                until = until_list,
                max_tweets = 2000,
                location = "31.8457, -102.3676",
                radius = "14mi")

Index 0: day 2020-08-30 complete at -4.749912738800049 seconds
Index 1: day 2020-08-31 complete at -72.40460681915283 seconds
Index 2: day 2020-09-01 complete at -142.0024437904358 seconds
Index 3: day 2020-09-02 complete at -209.2247281074524 seconds
Index 4: day 2020-09-03 complete at -278.6753468513489 seconds
Index 5: day 2020-09-04 complete at -347.9718837738037 seconds
Index 6: day 2020-09-05 complete at -461.8415994644165 seconds
Final run time: -461.8415994644165 seconds


In [27]:
odessa_covid = odessa['text'].str.contains('covid|coronavirus', flags=re.IGNORECASE, regex=True).sum()
odessa_covid

22

In [28]:
abilene = all_tweets(
                region =  "Abilene",
                query = ' ',
                since = since_list,
                until = until_list,
                max_tweets = 2000,
                location = "32.4487, -99.7331",
                radius = "9mi")

Index 0: day 2020-08-30 complete at -9.042999029159546 seconds
Index 1: day 2020-08-31 complete at -80.21705770492554 seconds
Index 2: day 2020-09-01 complete at -151.27152585983276 seconds
Index 3: day 2020-09-02 complete at -219.85091471672058 seconds
Index 4: day 2020-09-03 complete at -289.92616605758667 seconds
Index 5: day 2020-09-04 complete at -361.370646238327 seconds


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



An error occured during an HTTP request: [WinError 10053] An established connection was aborted by the software in your host machine
Try to open in browser: https://twitter.com/search?q=%20%20near%3A%2232.4487%2C%20-99.7331%22%20within%3A9mi%20since%3A2020-09-05%20until%3A2020-09-06&src=typd
Traceback (most recent call last):
  File "C:\Users\12155\anaconda3\lib\site-packages\GetOldTweets3\manager\TweetManager.py", line 344, in getJsonResponse
    jsonResponse = response.read()
  File "C:\Users\12155\anaconda3\lib\http\client.py", line 470, in read
    s = self._safe_read(self.length)
  File "C:\Users\12155\anaconda3\lib\http\client.py", line 620, in _safe_read
    chunk = self.fp.read(min(amt, MAXAMOUNT))
  File "C:\Users\12155\anaconda3\lib\socket.py", line 589, in readinto
    return self._sock.recv_into(b)
  File "C:\Users\12155\anaconda3\lib\ssl.py", line 1071, in recv_into
    return self.read(nbytes, buffer)
  File "C:\Users\12155\anaconda3\lib\ssl.py", line 929, in read
    ret

SystemExit: 

In [None]:
abilene_covid = abilene['text'].str.contains('covid|coronavirus', flags=re.IGNORECASE, regex=True).sum()
abilene_covid