# TASK 1

## Get Tweets from Twitter ##

I am using the snscrape library to scrape data from Twitter
https://github.com/JustAnotherArchivist/snscrape

In [20]:
# import libraries
import snscrape.modules.twitter as sntwitter
import pandas as pd
import csv
import os
import re
import string

#### Date of Data Collection: 19 November 2022
#### Time of Data Collection: 5:01 PM PKT

In [21]:
# get tweets using the api
tweets = []

for i,tweet in enumerate(sntwitter.TwitterSearchScraper('from:LeoVaradkar').get_items()):
    if i>=1000:
        break
    tweets.append([tweet.rawContent]) 


In [31]:
print(tweets[0:3])

# store tweets in a csv file
path = os.path.dirname(os.path.abspath("24100004_Phase1.ipynb"))
f = open(path + "/LeoVaradkar_task1.csv", "w")
writer = csv.writer(f)

for tweet in tweets:
    writer.writerow(tweet)


[['This graph really tells a story. Follow the green line. No reason why most of our electricity should not be renewable within a few years. https://t.co/hnXrBZFagS'], ['Speaker Pelosi has been one of Ireland’s most steadfast supporters and allies during her term of office. She is a force to be reckoned with. Wishing her and Paul the very best in the next chapter of their lives https://t.co/74EMUN2EJt'], ['The next exciting phase of the National Sports Campus starts here today in #dubw with plans for a velodrome and much much more https://t.co/NdOWtQafg3']]


# TASK 2

In [32]:

cleaned = [None] * len(tweets)

emojis = re.compile("["  u"\U0001F600-\U0001F64F"  u"\U0001F300-\U0001F5FF"  u"\U0001F680-\U0001F6FF"   u"\U0001F1E0-\U0001F1FF"   u"\U00002500-\U00002BEF"   u"\U00002702-\U000027B0"  u"\U00002702-\U000027B0"   u"\U000024C2-\U0001F251"   u"\U0001f926-\U0001f937"   u"\U00010000-\U0010ffff"   u"\u2640-\u2642"   u"\u2600-\u2B55"   u"\u200d"   u"\u23cf"   u"\u23e9"   u"\u231a"   u"\ufe0f"   u"\u3030"   "]+", flags=re.UNICODE)

for i in range(len(tweets)):
    cleaned[i] = re.sub("-", " ", tweets[i][0]) # convert - to space
    cleaned[i] = re.sub(",", " ", cleaned[i]) # convert , to space
    cleaned[i] = re.sub("\d", "", cleaned[i]) # remove digits
    cleaned[i] = re.sub("['؛$'–،٫’?؟۔٪\/:\"|!()°.;″′\-]", "", cleaned[i]) # remove punctuation marks
    cleaned[i] = re.sub("http\S+", "", cleaned[i]) # remove links
    cleaned[i] = re.sub("\n", "", cleaned[i]) # remove new line characters
    cleaned[i] = re.sub("<.*?>", " ", cleaned[i]) # remove html characters
    cleaned[i] = emojis.sub(r'', cleaned[i]) # remove emojis
    cleaned[i] = cleaned[i].lower() # convert to lower case
    cleaned[i] = re.sub("&\w+", "", cleaned[i]) # remove any & words
    cleaned[i] = re.sub("\u2066", "", cleaned[i]) # remove any \u words
    cleaned[i] = re.sub("\xa0", "", cleaned[i]) # remove any \x words
    cleaned[i] = re.sub(u"\N{euro sign}", "", cleaned[i]) # remove any \x words
   
# load stop words
f = open(path + "/stop_words.txt", "r")
stopWords = f.read()
stopWords = stopWords.split("\n")

# remove stop words and ''
print(len(cleaned))
for i in range(len(cleaned)):
    cleaned[i] = cleaned[i].split(" ")
    while('' in cleaned[i]):
        cleaned[i].remove('')
    for j in range(len(stopWords)):
        while(stopWords[j] in cleaned[i]):
                cleaned[i].remove(stopWords[j])
 
# convert back to 1d arr
for i in range(len(cleaned)):
    cleaned[i] = " ".join(cleaned[i])

df = pd.DataFrame(cleaned, columns=None)
df.to_csv("LeoVaradkar_task2.csv", index=False, encoding="utf-8", header=None)




1000


# TASK 3

In [33]:
# train test split
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(cleaned, test_size=0.2, shuffle=True)


def createVocabulary(X_train):
    for i in range(len(X_train)):
        X_train[i] = X_train[i].split(" ")
    # create vocabulary of words
    allvocab = []
    # step 1: add all the words in the dataset 
    for i in range(len(X_train)):
        for j in range(len(X_train[i])):
            allvocab.append(X_train[i][j])



    # step 3: remove any duplicates characters
    vocab = []
    for word in allvocab:
        if word not in vocab:
            vocab.append(word)
    
    return vocab

vocab = createVocabulary(X_train)
print(vocab)


['ill', 'talking', '@cooper_m', 'shortly', '@todayfm', 'strong', 'economy', 'means', 'help', 'people', 'put', 'money', 'back', 'pocket', 'reduce', 'cost', 'living', 'good', 'day', 'letterkenny', 'fintru', 'creating', 'new', 'jobs', 'talent', 'skills', 'region', 'doubt', 'important', 'factor', 'decision', '#fullemployment', 'august', 'message', 'me@deptenterprise', 'pleased', 'welcome', '@daracalleary', 'department', 'enterprise', 'trade', 'employment', 'know', 'well', 'work', 'together', 'big', 'issues', 'facing', 'business', 'workers', 'consumers', 'national', 'maternity', 'hospital', 'approved', 'cabinet', 'today', 'plan', 'go', 'tender', 'within', 'months', 'hopefully', 'construction', 'next', 'year', 'take', 'years', 'build', 'fit', 'commission', 'best', 'wishes', 'james', 'excellent', 'candidate', 'look', 'forward', 'seeing', 'campaign', 'trail', 'remarkable', 'results', 'despite', 'challenges', 'irish', 'managed', 'record', 'exports', 'behind', 'figures', 'thousands', 'every', 'c

In [34]:
print(len(vocab))

4087


In [35]:
# create bag of words
def create_bags(vocab, cleaned):
    X_complete = []
    all_dicts = []
    arr = [0] * len(vocab)

    for i in range(len(cleaned)):
        cleaned_dict = dict(zip(vocab, arr))
                
        all_dicts.append(cleaned_dict)

    for i in range(len(cleaned)):
        cleaned[i] = cleaned[i].split(" ")

    # # go over every word in the vocab
    #     if that word is in your cleaned[i]
    #         vocab[word] += 1

    for i in range(len(cleaned)):
        for w in cleaned[i]:
            if w in vocab:
                all_dicts[i][w] += 1
        # get values
        bag_of_words = list(all_dicts[i].values())
        X_complete.append(bag_of_words)

    # laplace smoothing - increment number of words by one 
    for i in range(len(X_complete)):
        for j in range(len(X_complete[i])):
            X_complete[i][j] += 1
        

    return X_complete 
      
X_complete = create_bags(vocab, cleaned)


In [36]:
x = 0
flag = False
print("X_train BOW:")
for i in range(len(cleaned)):
    for j in range(len(X_train)):
        if cleaned[i] == X_train[j] and x < 10:
            print(X_complete[i])
            x += 1


X_train BOW:
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [37]:
for i in range(len(X_test)):
    X_test[i] = X_test[i].split(" ")

x = 0
flag = False
print("X_test BOW:")
for i in range(len(cleaned)):
    for j in range(len(X_test)):
        if cleaned[i] == X_test[j] and x < 10:
            print(X_complete[i])
            x += 1




X_test BOW:
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [38]:
print("10 raw tweets")
print(tweets[0:10])

print("\n10 cleaned tweets")
print(cleaned[0:10])

10 raw tweets
[['This graph really tells a story. Follow the green line. No reason why most of our electricity should not be renewable within a few years. https://t.co/hnXrBZFagS'], ['Speaker Pelosi has been one of Ireland’s most steadfast supporters and allies during her term of office. She is a force to be reckoned with. Wishing her and Paul the very best in the next chapter of their lives https://t.co/74EMUN2EJt'], ['The next exciting phase of the National Sports Campus starts here today in #dubw with plans for a velodrome and much much more https://t.co/NdOWtQafg3'], ['I’ll be speaking with Lynsey Dolan on @midlands103 shortly about the @FineGael Ard Fheis taking place in Athlone this weekend. #FGAF22 \n\nhttps://t.co/VcntfyXMxp'], ['Congratulations to @DLA_Piper who plan to double their workforce in Ireland. \n\nIreland is home to a highly experienced legal sector, which is testament to @DLA_Piper’s \xa0success in reaching its 100-employee milestone since it first established here