In [None]:
import pandas as pd

In [109]:
def check_filter(filename: str, encoding: str):
    df = pd.read_csv(filename, encoding=encoding)
    total_length = len(df)
    # print(df.head)
    for col in df.columns:
        has_null = df[col].isnull().any()
        if has_null:
            return f"{filename} has null value at {col} \n"
        if len(df[col]) != total_length:
            return f"{filename} has incorrect column length at {col} \n"
    df1 = df.drop_duplicates()
    if len(df1) != total_length:
        return f"{filename} has duplicate rows \n"
    return f"{filename} check filter success \n"

# columns are ['id', 'thread_number', 'timestamp', 'text', 'retweets', 'likes', 'replies']
def check_column_types(filename: str, encoding: str):
    df = pd.read_csv(filename, encoding=encoding)
    if (df['timestamp'].dtype != int) or (df['retweets'].dtype != int) or (df['likes'].dtype != int) or (df['replies'].dtype != int):
        return f"{filename} has incorrect type \n"
    has_negatives1 = (df['retweets'] < 0).any()
    has_negatives2 = (df['likes'] < 0).any()
    has_negatives3 = (df['replies'] < 0).any()
    if has_negatives1 or has_negatives2 or has_negatives3:
        return f"{filename} has negative engagement values \n"
    for thread in df['thread_number']:
        if type(thread) != str:
            return f"{filename} has incorrect type, thread_number not a string \n"
        if thread[0:6] != "Thread":
            return f"{filename} has incorrect name for thread_number \n"
    return f"{filename} check column type success \n"

def add_url_column(filename: str, encoding: str):
    df = pd.read_csv(filename, encoding=encoding)
    total_length = len(df)
    url_column = []
    for data in df.iterrows():
        if data[1]["text"]:
            url_arr = data[1]["text"].split("http")
            if len(url_arr) > 1:
                true_url = "http" + url_arr[1]
                url_column.append(true_url)
                # print(true_url)
            else:
                url_column.append("")
        else:
            url_column.append("")
    if len(url_column) == total_length:
        df["url"] = url_column
        new_csv = filename.split(".")[0]+"_url.csv"
        df.to_csv(new_csv)
        return df
    return 0


In [None]:
# check_filter("five_ten.csv", encoding="ISO-8859-1")
# check_column_types("five_ten.csv", encoding="ISO-8859-1")
# add_url_column("five_ten.csv", encoding="ISO-8859-1")

Unnamed: 0,id,thread_number,timestamp,text,retweets,likes,replies,url
0,999307110902050818,Thread 1,1527088356,Extraordinary evidence at Treasury committee f...,66,59,5,https://t.co/DJhIQhmVwJ
1,999307395712143360,Thread 1,1527088424,The Brexiter favourite Max Fac - would cost bu...,83,107,10,https://t.co/0MwIcwre4t
2,999307826265812992,Thread 1,1527088526,How does he arrive at the figure\r\r\n\r\r\n20...,6,11,2,https://t.co/KxnkU2QiVO
3,999308153346052102,Thread 1,1527088604,Theresa May's New Customs Partnership is much ...,7,10,1,https://t.co/0LcsJHah0H
4,999308653894230022,Thread 1,1527088724,Mr Thompson said he did not expect the EU to r...,17,12,2,https://t.co/9c3uhhnZGX
...,...,...,...,...,...,...,...,...
801,977334201170063360,Thread 101,1521849606,2) Congress basically screwed themselves by no...,33,40,2,https://t.co/ommnR24kPT
802,977334207503515648,Thread 101,1521849607,5) @POTUS must have realized that those â??app...,36,60,3,https://t.co/pIHLTyn6eO
803,977334205469241344,Thread 101,1521849607,4) He spent the money or didnâ??t spend it how...,21,46,5,https://t.co/gxFEaYr3R6
804,977334211743924224,Thread 101,1521849608,6) What if @POTUS decided to tell the #Treasur...,55,84,5,https://t.co/wT76hT941U


In [108]:
print("Inital check for all datasets: ")
check_filter("five_ten.csv", encoding="ISO-8859-1")
check_filter("ten_fifteen.csv", encoding="ISO-8859-1")
check_filter("fifteen_twenty.csv", encoding="ISO-8859-1")
check_filter("twenty_twentyfive.csv", encoding="ISO-8859-1")
check_filter("twentyfive_thirty.csv", encoding="ISO-8859-1")
print("Done. ")

print("Column check for all datasets: ")
check_column_types("five_ten.csv", encoding="ISO-8859-1")
check_column_types("ten_fifteen.csv", encoding="ISO-8859-1")
check_column_types("fifteen_twenty.csv", encoding="ISO-8859-1")
check_column_types("twenty_twentyfive.csv", encoding="ISO-8859-1")
check_column_types("twentyfive_thirty.csv", encoding="ISO-8859-1")
print("Done. ")

Inital check for all datasets: 
Done. 
Column check for all datasets: 
Done. 


In [110]:
print("Generating urls for all datasets: ")
add_url_column("five_ten.csv", encoding="ISO-8859-1")
add_url_column("ten_fifteen.csv", encoding="ISO-8859-1")
add_url_column("fifteen_twenty.csv", encoding="ISO-8859-1")
add_url_column("twenty_twentyfive.csv", encoding="ISO-8859-1")
add_url_column("twentyfive_thirty.csv", encoding="ISO-8859-1")
print("Done. ")

Generating urls for all datasets: 
Done. 


In [None]:
import chardet

# Detect encoding
with open('five_ten.csv', 'rb') as file:
    print(chardet.detect(file.read()))

{'encoding': 'ISO-8859-1', 'confidence': 0.7222415887282886, 'language': ''}


In [111]:
import requests
from bs4 import BeautifulSoup

short_url = 'https://t.co/6pbG0zmYfQ'
response = requests.get(short_url, allow_redirects=True)
final_url = response.url
idtemp = final_url.split("/status/")[1]
id = idtemp.split("/")[0]
print(f"The id is: {id}")

url = f'https://threadreaderapp.com/thread/{id}.html'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
content_divs = soup.find_all('div', class_='content-tweet allow-preview')
print(content_divs)

thread_array = []
for div in content_divs:
  thread = div.get_text(separator="\n", strip=True)
  print(thread)
  thread_array.append(thread)
  print("-" * 80)
print(thread_array)

The id is: 970189432581230593
[<div class="content-tweet allow-preview" data-action="click-&gt;thread#showTweet" data-controller="thread" data-screenname="Fr33dom4theWIN" data-tweet="970186624985391110" dir="auto" id="tweet_1">
ALAS NEW <a class="entity-hashtag" href="/hashtag/QAnon">#QAnon</a> !!!<br/>
<br/>
<a class="entity-mention" href="https://twitter.com/POTUS">@POTUS</a> is working to save us all! <a class="entity-hashtag" href="/hashtag/TheGreatAwakening">#TheGreatAwakening</a> <a class="entity-hashtag" href="/hashtag/TheStormIsHere">#TheStormIsHere</a> <a class="entity-hashtag" href="/hashtag/MAGA">#MAGA</a> <span class="entity-image"><img alt="" data-src="https://pbs.twimg.com/media/DXbLlzxWAAAeu0Q.jpg" src="/images/1px.png"/></span>
<sup class="tw-permalink"><i class="fas fa-link"></i></sup>
</div>, <div class="content-tweet allow-preview" data-action="click-&gt;thread#showTweet" data-controller="thread" data-screenname="Fr33dom4theWIN" data-tweet="970189432581230593" dir="a

In [None]:
import requests

# Shortened URL
short_url = 'https://t.co/sS5m7msvd5'

# Send a request to the short URL and allow it to follow redirects
response = requests.get(short_url, allow_redirects=True)

# The final URL after all redirects
final_url = response.url

print(f"The final URL is: {final_url}")

idtemp = final_url.split("/status/")[1]
print(idtemp)
print(idtemp.split("/")[0])

The final URL is: https://twitter.com/i/web/status/998995611650347008
998995611650347008
998995611650347008
