# All necessary imports.

In [1]:
import pandas as pd
import chardet
import requests
from bs4 import BeautifulSoup

# Checking the encoding of the raw datasets.

In [None]:
# check encoding, should be encoding="ISO-8859-1".
# note: "Windows-1252" is a superset of "ISO-8859-1"
def check_encoding(filename: str):
    with open(filename, 'rb') as file:
        print(chardet.detect(file.read()))

check_encoding("five_ten.csv")
check_encoding("ten_fifteen.csv")
check_encoding("fifteen_twenty.csv")
check_encoding("twenty_twentyfive.csv")
check_encoding("twentyfive_thirty.csv")

{'encoding': 'ISO-8859-1', 'confidence': 0.7222415887282886, 'language': ''}
{'encoding': 'ISO-8859-1', 'confidence': 0.721592522749421, 'language': ''}
{'encoding': 'ISO-8859-1', 'confidence': 0.7201988424168467, 'language': ''}
{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}
{'encoding': 'Windows-1252', 'confidence': 0.7299693486436825, 'language': ''}


# Checking null, empty, incorrect type for all data rows.

In [None]:
def check_filter(filename: str, encoding: str):
    df = pd.read_csv(filename, encoding=encoding)
    total_length = len(df)
    # print(df.head)
    for col in df.columns:
        has_null = df[col].isnull().any()
        if has_null:
            return f"{filename} has null value at {col} \n"
        if len(df[col]) != total_length:
            return f"{filename} has incorrect column length at {col} \n"
    df1 = df.drop_duplicates()
    if len(df1) != total_length:
        return f"{filename} has duplicate rows \n"
    return f"{filename} check filter success \n"

# columns are ['id', 'thread_number', 'timestamp', 'text', 'retweets', 'likes', 'replies']
def check_column_types(filename: str, encoding: str):
    df = pd.read_csv(filename, encoding=encoding)
    if (df['timestamp'].dtype != int) or (df['retweets'].dtype != int) or (df['likes'].dtype != int) or (df['replies'].dtype != int):
        return f"{filename} has incorrect type \n"
    has_negatives1 = (df['retweets'] < 0).any()
    has_negatives2 = (df['likes'] < 0).any()
    has_negatives3 = (df['replies'] < 0).any()
    if has_negatives1 or has_negatives2 or has_negatives3:
        return f"{filename} has negative engagement values \n"
    for thread in df['thread_number']:
        if type(thread) != str:
            return f"{filename} has incorrect type, thread_number not a string \n"
        if thread[0:6] != "Thread":
            return f"{filename} has incorrect name for thread_number \n"
    return f"{filename} check column type success \n"

def add_url_column(filename: str, encoding: str):
    df = pd.read_csv(filename, encoding=encoding)
    total_length = len(df)
    url_column = []
    for data in df.iterrows():
        if data[1]["text"]:
            url_arr = data[1]["text"].split("http")
            if len(url_arr) > 1:
                true_url = "http" + url_arr[1]
                url_column.append(true_url)
                # print(true_url)
            else:
                url_column.append("")
        else:
            url_column.append("")
    if len(url_column) == total_length:
        df["url"] = url_column
        new_csv = filename.split(".")[0]+"_url.csv"
        df.to_csv(new_csv)
        return df
    return 0


In [None]:
# check_filter("five_ten.csv", encoding="ISO-8859-1")
# check_column_types("five_ten.csv", encoding="ISO-8859-1")
# add_url_column("five_ten.csv", encoding="ISO-8859-1")

Unnamed: 0,id,thread_number,timestamp,text,retweets,likes,replies,url
0,999307110902050818,Thread 1,1527088356,Extraordinary evidence at Treasury committee f...,66,59,5,https://t.co/DJhIQhmVwJ
1,999307395712143360,Thread 1,1527088424,The Brexiter favourite Max Fac - would cost bu...,83,107,10,https://t.co/0MwIcwre4t
2,999307826265812992,Thread 1,1527088526,How does he arrive at the figure\r\r\n\r\r\n20...,6,11,2,https://t.co/KxnkU2QiVO
3,999308153346052102,Thread 1,1527088604,Theresa May's New Customs Partnership is much ...,7,10,1,https://t.co/0LcsJHah0H
4,999308653894230022,Thread 1,1527088724,Mr Thompson said he did not expect the EU to r...,17,12,2,https://t.co/9c3uhhnZGX
...,...,...,...,...,...,...,...,...
801,977334201170063360,Thread 101,1521849606,2) Congress basically screwed themselves by no...,33,40,2,https://t.co/ommnR24kPT
802,977334207503515648,Thread 101,1521849607,5) @POTUS must have realized that those â??app...,36,60,3,https://t.co/pIHLTyn6eO
803,977334205469241344,Thread 101,1521849607,4) He spent the money or didnâ??t spend it how...,21,46,5,https://t.co/gxFEaYr3R6
804,977334211743924224,Thread 101,1521849608,6) What if @POTUS decided to tell the #Treasur...,55,84,5,https://t.co/wT76hT941U


In [None]:
print("Inital check for all datasets: ")
check_filter("five_ten.csv", encoding="ISO-8859-1")
check_filter("ten_fifteen.csv", encoding="ISO-8859-1")
check_filter("fifteen_twenty.csv", encoding="ISO-8859-1")
check_filter("twenty_twentyfive.csv", encoding="ISO-8859-1")
check_filter("twentyfive_thirty.csv", encoding="ISO-8859-1")
print("Done. ")

print("Column check for all datasets: ")
check_column_types("five_ten.csv", encoding="ISO-8859-1")
check_column_types("ten_fifteen.csv", encoding="ISO-8859-1")
check_column_types("fifteen_twenty.csv", encoding="ISO-8859-1")
check_column_types("twenty_twentyfive.csv", encoding="ISO-8859-1")
check_column_types("twentyfive_thirty.csv", encoding="ISO-8859-1")
print("Done. ")

Inital check for all datasets: 
Done. 
Column check for all datasets: 
Done. 


In [None]:
print("Generating urls for all datasets: ")
add_url_column("five_ten.csv", encoding="ISO-8859-1")
add_url_column("ten_fifteen.csv", encoding="ISO-8859-1")
add_url_column("fifteen_twenty.csv", encoding="ISO-8859-1")
add_url_column("twenty_twentyfive.csv", encoding="ISO-8859-1")
add_url_column("twentyfive_thirty.csv", encoding="ISO-8859-1")
print("Done. ")

Generating urls for all datasets: 
Done. 


# Grabbing full content for all datasets using url.

In [39]:
def get_full_text_list(short_url: str):
    try:
        if "https://" not in short_url:
            return "bad_url"
        response = requests.get(short_url, allow_redirects=True)
        print(response.url)
        final_url = response.url
        if "https://twitter.com" not in final_url:
            return "bad_url"
        idtemp_arr = final_url.split("/status/")
        if not len(idtemp_arr) > 1:
            return "bad_url"
        idtemp = idtemp_arr[1]
        id = idtemp.split("/")[0]
        url = f'https://threadreaderapp.com/thread/{id}.html'
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        content_divs = soup.find_all('div', class_='content-tweet allow-preview')
        thread_array = []
        for div in content_divs:
            thread = div.get_text(separator="\n", strip=True)
            thread_array.append(thread)
        return thread_array
    except:
        return "bad_url"

def get_full_text(filename: str, encoding:str):
    df = pd.read_csv(filename, encoding=encoding)
    total_length = len(df)
    full_text_dic = {}                                # dictionary that stores full text
    thread_dic = {}                                   # dictionary that stores correct thread length
    good_thread_column = []                           # list that stores all of the good threads
    cur_thread_number = "Thread 0"                    # default to some non existing thread number
    for data in df.iterrows():                        # goal is to get correct thread length
        thread_number = data[1]["thread_number"]
        if thread_number not in thread_dic.keys():
            thread_dic[thread_number] = 1
            full_text_dic[thread_number] = []
        else:
            thread_dic[thread_number] += 1

    should_double_check = []
    grabbed_full_text_or_bad_tweet = 0
    for data in df.iterrows():
        thread_number = data[1]["thread_number"]
        short_url = data[1]["url"]

        # case1: same thread, already grabbed all information or bad tweet
        if thread_number == cur_thread_number and grabbed_full_text_or_bad_tweet:
            continue

        # case2: moved to the next thread
        if thread_number != cur_thread_number:
            cur_thread_number = thread_number
            grabbed_full_text_or_bad_tweet = 0
        if short_url is not None and short_url != '' and isinstance(short_url, str) and not grabbed_full_text_or_bad_tweet:
            print(thread_number + "starting, " + short_url)
            thread_arr = get_full_text_list(short_url)
            if isinstance(thread_arr, str) and thread_arr == "bad_url":
                continue
            if len(thread_arr) > 0:
                if len(thread_arr) == thread_dic[thread_number]:
                    print(thread_number, " matches correct length")
                elif len(thread_arr) < thread_dic[thread_number]:
                    print(thread_number, "has incorrect array length, array too small")
                    should_double_check.append(thread_number)
                else:
                    print(thread_number, "has incorrect array length, array too large")
                    should_double_check.append(thread_number)
                for i in range(thread_dic[thread_number]):
                    if i < len(thread_arr):
                        full_text_dic[thread_number].append(thread_arr[i])
                    else:
                        full_text_dic[thread_number].append("")
                grabbed_full_text_or_bad_tweet = 1
                good_thread_column.append(thread_number)
                print(thread_number + " done")
            else:
                print(thread_number + " bad tweet")
                grabbed_full_text_or_bad_tweet = 1

    full_text_column = []
    for thread in good_thread_column:
        if thread in full_text_dic.keys():
            for text in full_text_dic[thread]:
                full_text_column.append(text)

    filtered_df = df[df['thread_number'].isin(good_thread_column)]
    if len(filtered_df) == len(full_text_column):
        filtered_df["full_text"] = full_text_column
        new_csv = filename.split(".")[0]+"_full_text.csv"
        filtered_df.to_csv(new_csv)
        return filtered_df, should_double_check


In [22]:
filtered_df, should_double_check = get_full_text("five_ten_url.csv", "ISO-8859-1")
print(should_double_check)
print(filtered_df)

Thread 1starting, https://t.co/DJhIQhmVwJ
https://twitter.com/ChrisGiles_/status/999307110902050818/photo/1
Thread 1  matches correct length
Thread 1 done
Thread 2starting, https://t.co/HjfLJz6zKm
https://twitter.com/i/web/status/999345299087872000
Thread 2  matches correct length
Thread 2 done
Thread 3starting, https://t.co/1aQCx65ywX
https://twitter.com/i/web/status/999359361532219393
Thread 3  matches correct length
Thread 3 done
Thread 4starting, https://t.co/cobt9JIWEv
https://twitter.com/i/web/status/999356126641639424
Thread 4 bad tweet
Thread 5starting, https://t.co/DcgRxv9niy
https://twitter.com/i/web/status/971860842428985346
Thread 5  matches correct length
Thread 5 done
Thread 6starting, https://t.co/4g7XNTrLz4
https://twitter.com/i/web/status/971794685390934016
Thread 6 bad tweet
Thread 7starting, https://t.co/KqUH0jNZYX
https://twitter.com/i/web/status/969734011726671872
Thread 7 bad tweet
Thread 8starting, https://t.co/NNo1fBpJ1v
https://twitter.com/i/web/status/99388552

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["full_text"] = full_text_column


In [28]:
filtered_df, should_double_check = get_full_text("ten_fifteen_url.csv", "ISO-8859-1")
print(should_double_check)
print(filtered_df)

Thread 1starting, https://t.co/07rC6e4jEt
https://twitter.com/i/web/status/972205190593089538
Thread 1  matches correct length
Thread 1 done
Thread 2starting, https://t.co/Gq8Hh5LiZq
https://twitter.com/thehill/status/972606284485857281
Thread 2 bad tweet
Thread 3starting, https://t.co/7cnQHdSAvJ
https://twitter.com/i/web/status/972724814031892480
Thread 3  matches correct length
Thread 3 done
Thread 4starting, https://t.co/bXj3Ug3ebu
https://twitter.com/i/web/status/973336286433103874
Thread 4  matches correct length
Thread 4 done
Thread 5starting, https://t.co/QkEsfGlTEb
https://twitter.com/i/web/status/970747966091939844
Thread 5 bad tweet
Thread 6starting, https://t.co/9PW8K9UlPR
https://twitter.com/DaveNYviii/status/961077328628670465/photo/1
Thread 6  matches correct length
Thread 6 done
Thread 7starting, https://t.co/CB4zvHgOSa
https://twitter.com/i/web/status/973164631178076160
Thread 7  matches correct length
Thread 7 done
Thread 8starting, https://t.co/528hKCik3K
https://twit

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["full_text"] = full_text_column


In [29]:
filtered_df, should_double_check = get_full_text("fifteen_twenty_url.csv", "ISO-8859-1")
print(should_double_check)
print(filtered_df)

Thread 1starting, https://t.co/SjzJKhFRQ0
https://twitter.com/HNIJohnMiller/status/998968203681427458/photo/1
Thread 1  matches correct length
Thread 1 done
Thread 2starting, https://t.co/xXoEpEFtcS
https://twitter.com/i/web/status/998660806005768192
Thread 2 has incorrect array length, array too large
Thread 2 done
Thread 3starting, https://t.co/isi7lQfdom
https://twitter.com/i/web/status/999311515562868738
Thread 3 has incorrect array length, array too large
Thread 3 done
Thread 4starting, https://t.co/kPbsBAmsCW
https://twitter.com/i/web/status/976133078618984449
Thread 4 bad tweet
Thread 5starting, https://t.co/96IhKyyagP
https://twitter.com/i/web/status/973963426140708864
Thread 5  matches correct length
Thread 5 done
Thread 6starting, https://t.co/7RqltzB1ov
https://twitter.com/i/web/status/948227465222279168
Thread 6 has incorrect array length, array too small
Thread 6 done
Thread 7starting, https://t.co/B3fasRgzM7
https://twitter.com/i/web/status/976260629953736709
Thread 7 bad

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["full_text"] = full_text_column


In [32]:
filtered_df, should_double_check = get_full_text("twenty_twentyfive_url.csv", "ISO-8859-1")
print(should_double_check)
print(filtered_df)

Thread 1starting, https://t.co/TDFt0VdW3V
https://twitter.com/i/web/status/978656977705414657
Thread 1 bad tweet
Thread 2starting, https://t.co/cxUWZLLM3w
https://twitter.com/i/web/status/978534905041838080
Thread 2 bad tweet
Thread 3starting, https://t.co/2x2V0PgKiL
https://twitter.com/i/web/status/976113154517295105
Thread 3 bad tweet
Thread 4starting, https://t.co/Ek02KeYalD
https://twitter.com/i/web/status/969440229907103744
Thread 4 has incorrect array length, array too large
Thread 4 done
Thread 5starting, https://t.co/46HO8Hqy3e
https://twitter.com/i/web/status/972958930946048000
Thread 5  matches correct length
Thread 5 done
Thread 6starting, https://t.co/sEAlNlYURn
https://twitter.com/i/web/status/969000686196203521
Thread 6 bad tweet
Thread 7starting, https://t.co/1MLM5Sr18E
https://twitter.com/i/web/status/965718456539652096
Thread 7 bad tweet
Thread 8starting, https://t.co/1JzicUSWhP
https://twitter.com/i/web/status/995327377373777920
Thread 8 bad tweet
Thread 9starting, ht

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["full_text"] = full_text_column


In [41]:
filtered_df, should_double_check = get_full_text("twentyfive_thirty_url.csv", "ISO-8859-1")
print(should_double_check)
print(filtered_df)

Thread 1starting, https://t.co/IBkNFfm0ju
https://twitter.com/ThomasWictor/status/978874480968728576/photo/1
Thread 1 bad tweet
Thread 2starting, https://t.co/UaUCxtYL5a via @RMac18
https://t.co/UaUCxtYL5a%20via%20@RMac18
Thread 2starting, https://t.co/vKdmbRU3tt
https://twitter.com/i/web/status/979071958221574144
Thread 2 bad tweet
Thread 3starting, https://t.co/QNdcDBvb1s
https://twitter.com/i/web/status/969734870191685632
Thread 3 bad tweet
Thread 4starting, https://t.co/fybLKWrbVj
https://twitter.com/i/web/status/978816038832762881
Thread 4 bad tweet
Thread 5starting, https://t.co/Viinxnq61M
https://twitter.com/i/web/status/979095625479356422
Thread 5 bad tweet
Thread 6starting, https://t.co/YqadtwX9sL
https://twitter.com/i/web/status/979054943633133568
Thread 6  matches correct length
Thread 6 done
Thread 7starting, https://t.co/Bnz2dMHNSO
https://twitter.com/i/web/status/946831046921867264
Thread 7 bad tweet
Thread 8starting, https://t.co/qPr7BBmZBX
https://english.alarabiya.net/

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["full_text"] = full_text_column


# Testing playground

In [17]:
def get_full_text_list(short_url: str):
    response = requests.get(short_url, allow_redirects=True)
    print(response.url)
    final_url = response.url
    idtemp_arr = final_url.split("/status/")
    if not len(idtemp_arr) > 1:
        return "bad_url"
    idtemp = idtemp_arr[1]
    id = idtemp.split("/")[0]
    url = f'https://threadreaderapp.com/thread/{id}.html'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    content_divs = soup.find_all('div', class_='content-tweet allow-preview')
    thread_array = []
    for div in content_divs:
        thread = div.get_text(separator="\n", strip=True)
        thread_array.append(thread)
    return thread_array

df = pd.read_csv("five_ten_url.csv", encoding="ISO-8859-1")
total_length = len(df)
full_text_dic = {}                                                  # dictionary that stores full text
thread_dic = {}                                                     # dictionary that stores correct thread length
good_thread_column = []                                             # list that stores all of the good threads
cur_thread_number = "Thread 0"                                      # default to some non existing thread number
for data in df.iterrows():                                          # goal is to get correct thread length
    thread_number = data[1]["thread_number"]
    if thread_number not in thread_dic.keys():
        thread_dic[thread_number] = 1
        full_text_dic[thread_number] = []
    else:
        thread_dic[thread_number] += 1

should_double_check = []
grabbed_full_text_or_bad_tweet = 0
for data in df.iterrows():
    thread_number = data[1]["thread_number"]
    short_url = data[1]["url"]

    # case1: same thread, already grabbed all information or bad tweet
    if thread_number == cur_thread_number and grabbed_full_text_or_bad_tweet:
        continue

    # case2: moved to the next thread
    if thread_number != cur_thread_number:
        cur_thread_number = thread_number
        grabbed_full_text_or_bad_tweet = 0
    if short_url is not None and short_url != '' and isinstance(short_url, str) and not grabbed_full_text_or_bad_tweet:
        print(thread_number + "starting, " + short_url)
        thread_arr = get_full_text_list(short_url)
        if isinstance(thread_arr, str) and thread_arr == "bad_url":
            continue
        if len(thread_arr) > 0:
            if len(thread_arr) == thread_dic[thread_number]:
                print(thread_number, " matches correct length")
            elif len(thread_arr) < thread_dic[thread_number]:
                print(thread_number, "has incorrect array length, array too small")
                should_double_check.append(thread_number)
            else:
                print(thread_number, "has incorrect array length, array too large")
                should_double_check.append(thread_number)
            for i in range(thread_dic[thread_number]):
                if i < len(thread_arr):
                    full_text_dic[thread_number].append(thread_arr[i])
                else:
                    full_text_dic[thread_number].append("")
            grabbed_full_text_or_bad_tweet = 1
            good_thread_column.append(thread_number)
            print(thread_number + " done")
        else:
            print(thread_number + " bad tweet")
            grabbed_full_text_or_bad_tweet = 1

full_text_column = []
for thread in good_thread_column:
    if thread in full_text_dic.keys():
        for text in full_text_dic[thread]:
            full_text_column.append(text)

filtered_df = df[df['thread_number'].isin(good_thread_column)]
print(len(filtered_df))
print(len(full_text_column))

Thread 1starting, https://t.co/DJhIQhmVwJ
https://twitter.com/ChrisGiles_/status/999307110902050818/photo/1
Thread 1  matches correct length
Thread 1 done
Thread 2starting, https://t.co/HjfLJz6zKm
https://twitter.com/i/web/status/999345299087872000
Thread 2  matches correct length
Thread 2 done
Thread 3starting, https://t.co/1aQCx65ywX
https://twitter.com/i/web/status/999359361532219393
Thread 3  matches correct length
Thread 3 done
Thread 4starting, https://t.co/cobt9JIWEv
https://twitter.com/i/web/status/999356126641639424
Thread 4 bad tweet
Thread 5starting, https://t.co/DcgRxv9niy
https://twitter.com/i/web/status/971860842428985346
Thread 5  matches correct length
Thread 5 done
Thread 6starting, https://t.co/4g7XNTrLz4
https://twitter.com/i/web/status/971794685390934016
Thread 6 bad tweet
Thread 7starting, https://t.co/KqUH0jNZYX
https://twitter.com/i/web/status/969734011726671872
Thread 7 bad tweet
Thread 8starting, https://t.co/NNo1fBpJ1v
https://twitter.com/i/web/status/99388552

In [19]:
print(full_text_column)
print(should_double_check)

['Extraordinary evidence at Treasury committee from Jon Thompson, CEO of HMRC on customs and Brexit today', "The Brexiter favourite Max Fac - would cost business between £17 and £20bn a year\n- that's almost 1% of GDP\n- just for filling in forms\nThanks #Brexit", 'How does he arrive at the figure\n200m export consignments at an average cost of £32.50 each = £6.5bn (times two because two way traffic)\nplus around £4 to £7bn of rules of origin compliance from filling in other forms', 'Theresa May\'s New Customs Partnership is much cheaper for business (almost zero cost) because it seeks to replicate today\'s arrangements but is thought to be "cretinous" by brexiters and "magical thinking" by the EU27...\n...and', 'Mr Thompson said he did not expect the EU to reciprocate over the customs partnership.\nWhat that means is UK collects tariffs for EU and hands it over when a ship lands in Felixtowe and drives to Calais, but if ship first lands in Rotterdam, EU keeps the import tariffs.', 'Bo

In [16]:
output = {}
count = 0
for thread in full_text_column.keys():
    templen = len(full_text_column[thread])
    output[thread] = templen
    count += templen

print(output)

filtered_df = df[df['thread_number'].isin(good_thread_column)]
print(len(filtered_df))
print(count)
# print(len(full_text_column))

['Thread 1', 'Thread 2', 'Thread 3', 'Thread 5', 'Thread 10', 'Thread 12', 'Thread 17', 'Thread 18', 'Thread 20', 'Thread 21', 'Thread 22', 'Thread 26', 'Thread 27', 'Thread 28', 'Thread 29', 'Thread 32', 'Thread 36', 'Thread 37', 'Thread 38', 'Thread 41', 'Thread 42', 'Thread 43', 'Thread 51', 'Thread 52', 'Thread 54', 'Thread 61', 'Thread 63', 'Thread 66', 'Thread 68', 'Thread 71', 'Thread 72', 'Thread 73', 'Thread 74', 'Thread 76', 'Thread 82', 'Thread 86', 'Thread 87', 'Thread 88', 'Thread 89', 'Thread 91', 'Thread 92', 'Thread 93', 'Thread 94', 'Thread 95', 'Thread 96', 'Thread 97', 'Thread 98', 'Thread 100', 'Thread 101']
{'Thread 1': ['Extraordinary evidence at Treasury committee from Jon Thompson, CEO of HMRC on customs and Brexit today', "The Brexiter favourite Max Fac - would cost business between £17 and £20bn a year\n- that's almost 1% of GDP\n- just for filling in forms\nThanks #Brexit", 'How does he arrive at the figure\n200m export consignments at an average cost of £32.

In [37]:
# short_url = 'https://t.co/YqadtwX9sL'
# response = requests.get(short_url, allow_redirects=True)
# final_url = response.url
# idtemp = final_url.split("/status/")[1]
# id = idtemp.split("/")[0]
# # print(f"The id is: {id}")
# url = f'https://threadreaderapp.com/thread/{id}.html'
# response = requests.get(url)
# soup = BeautifulSoup(response.content, 'html.parser')
# content_divs = soup.find_all('div', class_='content-tweet allow-preview')
# # print(content_divs)
# thread_array = []
# for div in content_divs:
#     thread = div.get_text(separator="\n", strip=True)
#     print(thread)
#     thread_array.append(thread)
#     print("-" * 80)
# print(thread_array)

def test_url(url):
    try:
        if "https://" not in short_url:
            return "bad_url"
        response = requests.get(short_url, allow_redirects=True)
        print(response.url)
        final_url = response.url
        if "https://twitter.com" not in final_url:
            return "bad_url"
        idtemp_arr = final_url.split("/status/")
        if not len(idtemp_arr) > 1:
            return "bad_url"
        idtemp = idtemp_arr[1]
        id = idtemp.split("/")[0]
        url = f'https://threadreaderapp.com/thread/{id}.html'
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        content_divs = soup.find_all('div', class_='content-tweet allow-preview')
        thread_array = []
        for div in content_divs:
            thread = div.get_text(separator="\n", strip=True)
            thread_array.append(thread)
        return thread_array
    except:
        return "bad_url"

test_url("https://t.co/YqadtwX9sL")

https://twitter.com/i/web/status/979054943633133568


['(1)Once upon a crime, in the land of Democrapia on the continent of Liberalia lived a HOE named Stormy.\nDemocrapia, which was once called the United States, had fallen on hard times since the DemaGODS, as they now called themselves, had taken over.\n#disneyprincess\nExternal Tweet loading...\nIf nothing shows, it may have been deleted\nby @Education4Libs\nview original on Twitter',
 '(2)\nAll around Democrapia buildings were crumbling where great ones once stood, and all the apartments had been sub divided into even smaller and smaller apartments, the only exception to this was capital hill, where the great mansions of the DemoGods stood!\n#disneyprincess',
 '(3)\nStormy has been raised in one of those small apartments called a 3x3 grand safespace of happiness. The 3X3 was believed 2 have come from the subdivision of a 3 bedroom apt into 3 smaller 3 bedroom apts, but that was not an accepted “fact” by the Demagods,\n#disneyprincess',
 '(4)\nEveryone was hopeful of the DemaGODS promi