# All necessary imports.

In [112]:
import pandas as pd
import chardet
import requests
from bs4 import BeautifulSoup

# Checking the encoding of the raw datasets.

In [113]:
# check encoding, should be encoding="ISO-8859-1".
# note: "Windows-1252" is a superset of "ISO-8859-1"
def check_encoding(filename: str):
    with open(filename, 'rb') as file:
        print(chardet.detect(file.read()))

check_encoding("five_ten.csv")
check_encoding("ten_fifteen.csv")
check_encoding("fifteen_twenty.csv")
check_encoding("twenty_twentyfive.csv")
check_encoding("twentyfive_thirty.csv")

{'encoding': 'ISO-8859-1', 'confidence': 0.7222415887282886, 'language': ''}
{'encoding': 'ISO-8859-1', 'confidence': 0.721592522749421, 'language': ''}
{'encoding': 'ISO-8859-1', 'confidence': 0.7201988424168467, 'language': ''}
{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}
{'encoding': 'Windows-1252', 'confidence': 0.7299693486436825, 'language': ''}


# Checking null, empty, incorrect type for all data rows.

In [109]:
def check_filter(filename: str, encoding: str):
    df = pd.read_csv(filename, encoding=encoding)
    total_length = len(df)
    # print(df.head)
    for col in df.columns:
        has_null = df[col].isnull().any()
        if has_null:
            return f"{filename} has null value at {col} \n"
        if len(df[col]) != total_length:
            return f"{filename} has incorrect column length at {col} \n"
    df1 = df.drop_duplicates()
    if len(df1) != total_length:
        return f"{filename} has duplicate rows \n"
    return f"{filename} check filter success \n"

# columns are ['id', 'thread_number', 'timestamp', 'text', 'retweets', 'likes', 'replies']
def check_column_types(filename: str, encoding: str):
    df = pd.read_csv(filename, encoding=encoding)
    if (df['timestamp'].dtype != int) or (df['retweets'].dtype != int) or (df['likes'].dtype != int) or (df['replies'].dtype != int):
        return f"{filename} has incorrect type \n"
    has_negatives1 = (df['retweets'] < 0).any()
    has_negatives2 = (df['likes'] < 0).any()
    has_negatives3 = (df['replies'] < 0).any()
    if has_negatives1 or has_negatives2 or has_negatives3:
        return f"{filename} has negative engagement values \n"
    for thread in df['thread_number']:
        if type(thread) != str:
            return f"{filename} has incorrect type, thread_number not a string \n"
        if thread[0:6] != "Thread":
            return f"{filename} has incorrect name for thread_number \n"
    return f"{filename} check column type success \n"

def add_url_column(filename: str, encoding: str):
    df = pd.read_csv(filename, encoding=encoding)
    total_length = len(df)
    url_column = []
    for data in df.iterrows():
        if data[1]["text"]:
            url_arr = data[1]["text"].split("http")
            if len(url_arr) > 1:
                true_url = "http" + url_arr[1]
                url_column.append(true_url)
                # print(true_url)
            else:
                url_column.append("")
        else:
            url_column.append("")
    if len(url_column) == total_length:
        df["url"] = url_column
        new_csv = filename.split(".")[0]+"_url.csv"
        df.to_csv(new_csv)
        return df
    return 0


In [None]:
# check_filter("five_ten.csv", encoding="ISO-8859-1")
# check_column_types("five_ten.csv", encoding="ISO-8859-1")
# add_url_column("five_ten.csv", encoding="ISO-8859-1")

Unnamed: 0,id,thread_number,timestamp,text,retweets,likes,replies,url
0,999307110902050818,Thread 1,1527088356,Extraordinary evidence at Treasury committee f...,66,59,5,https://t.co/DJhIQhmVwJ
1,999307395712143360,Thread 1,1527088424,The Brexiter favourite Max Fac - would cost bu...,83,107,10,https://t.co/0MwIcwre4t
2,999307826265812992,Thread 1,1527088526,How does he arrive at the figure\r\r\n\r\r\n20...,6,11,2,https://t.co/KxnkU2QiVO
3,999308153346052102,Thread 1,1527088604,Theresa May's New Customs Partnership is much ...,7,10,1,https://t.co/0LcsJHah0H
4,999308653894230022,Thread 1,1527088724,Mr Thompson said he did not expect the EU to r...,17,12,2,https://t.co/9c3uhhnZGX
...,...,...,...,...,...,...,...,...
801,977334201170063360,Thread 101,1521849606,2) Congress basically screwed themselves by no...,33,40,2,https://t.co/ommnR24kPT
802,977334207503515648,Thread 101,1521849607,5) @POTUS must have realized that those â??app...,36,60,3,https://t.co/pIHLTyn6eO
803,977334205469241344,Thread 101,1521849607,4) He spent the money or didnâ??t spend it how...,21,46,5,https://t.co/gxFEaYr3R6
804,977334211743924224,Thread 101,1521849608,6) What if @POTUS decided to tell the #Treasur...,55,84,5,https://t.co/wT76hT941U


In [108]:
print("Inital check for all datasets: ")
check_filter("five_ten.csv", encoding="ISO-8859-1")
check_filter("ten_fifteen.csv", encoding="ISO-8859-1")
check_filter("fifteen_twenty.csv", encoding="ISO-8859-1")
check_filter("twenty_twentyfive.csv", encoding="ISO-8859-1")
check_filter("twentyfive_thirty.csv", encoding="ISO-8859-1")
print("Done. ")

print("Column check for all datasets: ")
check_column_types("five_ten.csv", encoding="ISO-8859-1")
check_column_types("ten_fifteen.csv", encoding="ISO-8859-1")
check_column_types("fifteen_twenty.csv", encoding="ISO-8859-1")
check_column_types("twenty_twentyfive.csv", encoding="ISO-8859-1")
check_column_types("twentyfive_thirty.csv", encoding="ISO-8859-1")
print("Done. ")

Inital check for all datasets: 
Done. 
Column check for all datasets: 
Done. 


In [110]:
print("Generating urls for all datasets: ")
add_url_column("five_ten.csv", encoding="ISO-8859-1")
add_url_column("ten_fifteen.csv", encoding="ISO-8859-1")
add_url_column("fifteen_twenty.csv", encoding="ISO-8859-1")
add_url_column("twenty_twentyfive.csv", encoding="ISO-8859-1")
add_url_column("twentyfive_thirty.csv", encoding="ISO-8859-1")
print("Done. ")

Generating urls for all datasets: 
Done. 


# Grabbing full content for all datasets using url.

In [176]:
def get_full_text(filename: str, encoding:str):
    df = pd.read_csv(filename, encoding=encoding)
    total_length = len(df)
    full_text_column = []
    good_thread_column = []
    cur_thread_number = "Thread 0" # default to some non existing thread number
    grabbed_full_text_or_bad_tweet = 0
    for data in df.iterrows():
        thread_number = data[1]["thread_number"]
        short_url = data[1]["url"]

        # case1: same thread, already grabbed all information or bad tweet
        if thread_number == cur_thread_number and grabbed_full_text_or_bad_tweet:
            continue

        # case2: moved to the next thread
        if thread_number != cur_thread_number:
            cur_thread_number = thread_number
            grabbed_full_text_or_bad_tweet = 0
        if short_url is not None and short_url != '' and isinstance(short_url, str) and not grabbed_full_text_or_bad_tweet:
            print(thread_number + "starting, " + short_url)
            thread_arr = get_full_text_list(short_url)
            if isinstance(thread_arr, str) and thread_arr == "bad_url":
                continue
            if len(thread_arr) > 0:
                for i in range(len(thread_arr)):
                    full_text_column.append(thread_arr[i])
                grabbed_full_text_or_bad_tweet = 1
                good_thread_column.append(thread_number)
                print(thread_number + " done")
            else:
                print(thread_number + " bad tweet")
                grabbed_full_text_or_bad_tweet = 1

    print(len(full_text_column))
    print(total_length)
    return good_thread_column, full_text_column

def get_full_text_list(short_url: str):
    response = requests.get(short_url, allow_redirects=True)
    print(response.url)
    final_url = response.url
    idtemp_arr = final_url.split("/status/")
    if not len(idtemp_arr) > 1:
        return "bad_url"
    idtemp = idtemp_arr[1]
    id = idtemp.split("/")[0]
    # print(f"The id is: {id}")
    url = f'https://threadreaderapp.com/thread/{id}.html'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    content_divs = soup.find_all('div', class_='content-tweet allow-preview')
    # print(content_divs)
    thread_array = []
    for div in content_divs:
        thread = div.get_text(separator="\n", strip=True)
        # print(thread)
        thread_array.append(thread)
        # print("-" * 80)
        # print(thread_array)
    return thread_array

def generate_new_df(filename: str, encoding: str, good_thread: list, full_text: list):
    df = pd.read_csv(filename, encoding=encoding)
    filtered_df = df[df['thread_number'].isin(good_thread)]
    if (len(filtered_df) == len(full_text)):
        filtered_df["full_text"] = full_text
        new_csv = filename.split(".")[0]+"_full_text.csv"
        df.to_csv(new_csv)
        return df
    return 0

def generate_full_text(filename: str, encoding:str):
    good_thread_column, full_text_column = get_full_text(filename, encoding)
    output = generate_new_df(filename, encoding, good_thread_column, full_text_column)
    return output


In [178]:
# generate_full_text("five_ten_url.csv", encoding="ISO-8859-1")
good_thread_column, full_text_column = get_full_text("five_ten_url.csv", "ISO-8859-1")
output = generate_new_df("five_ten_url.csv", "ISO-8859-1", good_thread_column, full_text_column)

Thread 1starting, https://t.co/DJhIQhmVwJ
https://twitter.com/ChrisGiles_/status/999307110902050818/photo/1
Thread 1 done
Thread 2starting, https://t.co/HjfLJz6zKm
https://twitter.com/i/web/status/999345299087872000
Thread 2 done
Thread 3starting, https://t.co/1aQCx65ywX
https://twitter.com/i/web/status/999359361532219393
Thread 3 done
Thread 4starting, https://t.co/cobt9JIWEv
https://twitter.com/i/web/status/999356126641639424
Thread 4 bad tweet
Thread 5starting, https://t.co/DcgRxv9niy
https://twitter.com/i/web/status/971860842428985346
Thread 5 done
Thread 6starting, https://t.co/4g7XNTrLz4
https://twitter.com/i/web/status/971794685390934016
Thread 6 bad tweet
Thread 7starting, https://t.co/KqUH0jNZYX
https://twitter.com/i/web/status/969734011726671872
Thread 7 bad tweet
Thread 8starting, https://t.co/NNo1fBpJ1v
https://twitter.com/i/web/status/993885523503714305
Thread 8 bad tweet
Thread 9starting, https://t.co/1yJcxAPd15
https://twitter.com/i/web/status/970227980306755585
Thread 9

In [198]:
print(len(good_thread_column))
print(len(full_text_column))
df1 = pd.read_csv("five_ten_url.csv", encoding="ISO-8859-1")
filtered_df = df1[df1['thread_number'].isin(good_thread_column)]
print(df1["thread_number"])
print(filtered_df["thread_number"])
print(len(filtered_df["thread_number"].unique()))

for i in full_text_column:
    print(i)
    print("-----")

with open('output_five_ten.txt', 'w') as file:
    for item in full_text_column:
        file.write(f"{item}\n")
        file.write("-----\n")

with open('output_five_ten_thread.txt', 'w') as file:
    for item in good_thread_column:
        file.write(f"{item}\n")

49
426
0        Thread 1
1        Thread 1
2        Thread 1
3        Thread 1
4        Thread 1
          ...    
801    Thread 101
802    Thread 101
803    Thread 101
804    Thread 101
805    Thread 101
Name: thread_number, Length: 806, dtype: object
0        Thread 1
1        Thread 1
2        Thread 1
3        Thread 1
4        Thread 1
          ...    
801    Thread 101
802    Thread 101
803    Thread 101
804    Thread 101
805    Thread 101
Name: thread_number, Length: 390, dtype: object
49
Extraordinary evidence at Treasury committee from Jon Thompson, CEO of HMRC on customs and Brexit today
-----
The Brexiter favourite Max Fac - would cost business between £17 and £20bn a year
- that's almost 1% of GDP
- just for filling in forms
Thanks #Brexit
-----
How does he arrive at the figure
200m export consignments at an average cost of £32.50 each = £6.5bn (times two because two way traffic)
plus around £4 to £7bn of rules of origin compliance from filling in other forms
-----
Theresa

In [187]:
df1 = pd.read_csv("five_ten.csv", encoding="ISO-8859-1")
print(df1)

                     id thread_number   timestamp  \
0    999307110902050818      Thread 1  1527088356   
1    999307395712143360      Thread 1  1527088424   
2    999307826265812992      Thread 1  1527088526   
3    999308153346052102      Thread 1  1527088604   
4    999308653894230022      Thread 1  1527088724   
..                  ...           ...         ...   
801  977334201170063360    Thread 101  1521849606   
802  977334207503515648    Thread 101  1521849607   
803  977334205469241344    Thread 101  1521849607   
804  977334211743924224    Thread 101  1521849608   
805  977334213564186624    Thread 101  1521849609   

                                                  text  retweets  likes  \
0    Extraordinary evidence at Treasury committee f...        66     59   
1    The Brexiter favourite Max Fac - would cost bu...        83    107   
2    How does he arrive at the figure\r\r\n\r\r\n20...         6     11   
3    Theresa May's New Customs Partnership is much ...         

In [None]:
import requests

# Shortened URL
short_url = 'https://t.co/sS5m7msvd5'

# Send a request to the short URL and allow it to follow redirects
response = requests.get(short_url, allow_redirects=True)

# The final URL after all redirects
final_url = response.url

print(f"The final URL is: {final_url}")

idtemp = final_url.split("/status/")[1]
print(idtemp)
print(idtemp.split("/")[0])



The final URL is: https://twitter.com/i/web/status/998995611650347008
998995611650347008
998995611650347008


In [146]:
short_url = 'https://t.co/DJhIQhmVwJ'
response = requests.get(short_url, allow_redirects=True)
final_url = response.url
idtemp = final_url.split("/status/")[1]
id = idtemp.split("/")[0]
# print(f"The id is: {id}")
url = f'https://threadreaderapp.com/thread/{id}.html'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
content_divs = soup.find_all('div', class_='content-tweet allow-preview')
# print(content_divs)
thread_array = []
for div in content_divs:
    thread = div.get_text(separator="\n", strip=True)
    print(thread)
    thread_array.append(thread)
    print("-" * 80)
print(thread_array)

Extraordinary evidence at Treasury committee from Jon Thompson, CEO of HMRC on customs and Brexit today
--------------------------------------------------------------------------------
The Brexiter favourite Max Fac - would cost business between £17 and £20bn a year
- that's almost 1% of GDP
- just for filling in forms
Thanks #Brexit
--------------------------------------------------------------------------------
How does he arrive at the figure
200m export consignments at an average cost of £32.50 each = £6.5bn (times two because two way traffic)
plus around £4 to £7bn of rules of origin compliance from filling in other forms
--------------------------------------------------------------------------------
Theresa May's New Customs Partnership is much cheaper for business (almost zero cost) because it seeks to replicate today's arrangements but is thought to be "cretinous" by brexiters and "magical thinking" by the EU27...
...and
--------------------------------------------------------