In [131]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import nlpaug.augmenter.word as naw
from gensim.models import KeyedVectors

In [132]:
'''
Preprocessing

a. Load the data into a dataframe. 
    Name it "jungyu_df", and examine the data. 
    You will notice that the file has a header and four tweets with their sentiments.
'''

filepath = 'C:/Users/Public/6th/NLP and Recommender Systems/Assignment1/COVID19_mini.csv'

jungyu_df = pd.read_csv(filepath)
jungyu_df.head()

Unnamed: 0,sentiment,text,user
0,positive,"RT @ashish_vaidya1: @Bhupendrapbjp sir, keshod mamlatdar office says they didnt receive grant for covid19 ex-gratia claim from govt. Many c…",ashish_vaidya1
1,negative,RT @TheAngryEpi: Informally reached out to a few epidemiologists about an open position in Arizona. Their response: “I would not work in AZ…,greensnow
2,positive,"RT @sinovic: If this is a joke from the @DMRegister it really isn’t funny. If it’s serious, they are completely out of touch from reality w…",RadioBradshaw
3,negative,"RT @MicahPollak: Well, #COVID19 is once again the leading cause of death in #Indiana (based on average daily deaths) and closing in on (aga…",KristinaTraxler


In [133]:
'''
b. Drop the user column.
'''
jungyu_df = jungyu_df.drop(columns=['user'])

In [134]:
'''
c. Use regular expressions or python string methods to get rid of the additional data at the begging and end of each tweet.
d. Check the tweet data and identify, if you need to carry out any further pre-processing steps, you should at least do two or three more steps. 
'''
pd.set_option('display.max_colwidth', None)
for i, text in enumerate(jungyu_df['text']):
    print(f'Tweet {i+1}:\n{text}\n')

Tweet 1:
RT @ashish_vaidya1: @Bhupendrapbjp sir, keshod mamlatdar office says they didnt receive grant for covid19 ex-gratia claim from govt. Many c…

Tweet 2:
RT @TheAngryEpi: Informally reached out to a few epidemiologists about an open position in Arizona. Their response: “I would not work in AZ…

Tweet 3:
RT @sinovic: If this is a joke from the @DMRegister it really isn’t funny. If it’s serious, they are completely out of touch from reality w…

Tweet 4:
RT @MicahPollak: Well, #COVID19 is once again the leading cause of death in #Indiana (based on average daily deaths) and closing in on (aga…



In [135]:
def clean_tweet(text):
    # Remove "RT @username: " at the beginning
    text = re.sub(r'^RT @\w+: ', '', text)
    # Remove all other @username mentions
    text = re.sub(r'@\w+', '', text)
    # Remove trailing ellipsis
    text = re.sub(r'…$', '', text)
    # Remove any characters that are not letters, numbers, or spaces
    text = re.sub(r'[^\w\s]', '', text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    
    # Remove any leading or trailing whitespace
    return text.strip()

In [136]:
jungyu_df['text'] = jungyu_df['text'].apply(clean_tweet)

In [137]:
for i, text in enumerate(jungyu_df['text']):
    print(f'Tweet {i+1}:\n{text}\n')

Tweet 1:
sir keshod mamlatdar office says they didnt receive grant for covid19 exgratia claim from govt Many c

Tweet 2:
Informally reached out to a few epidemiologists about an open position in Arizona Their response I would not work in AZ

Tweet 3:
If this is a joke from the it really isnt funny If its serious they are completely out of touch from reality w

Tweet 4:
Well COVID19 is once again the leading cause of death in Indiana based on average daily deaths and closing in on aga



In [138]:
'''
Data Augmentation
a. Tokenize the cleaned tweet.
'''
def tokenize_tweet(text):
    tokens = text.split()
    return tokens

In [139]:
jungyu_df['tokens'] = jungyu_df['text'].apply(tokenize_tweet)

for i, tokens in enumerate(jungyu_df['tokens']):
    print(f'Tweet {i+1} tokens:\n{tokens}\n')

Tweet 1 tokens:
['sir', 'keshod', 'mamlatdar', 'office', 'says', 'they', 'didnt', 'receive', 'grant', 'for', 'covid19', 'exgratia', 'claim', 'from', 'govt', 'Many', 'c']

Tweet 2 tokens:
['Informally', 'reached', 'out', 'to', 'a', 'few', 'epidemiologists', 'about', 'an', 'open', 'position', 'in', 'Arizona', 'Their', 'response', 'I', 'would', 'not', 'work', 'in', 'AZ']

Tweet 3 tokens:
['If', 'this', 'is', 'a', 'joke', 'from', 'the', 'it', 'really', 'isnt', 'funny', 'If', 'its', 'serious', 'they', 'are', 'completely', 'out', 'of', 'touch', 'from', 'reality', 'w']

Tweet 4 tokens:
['Well', 'COVID19', 'is', 'once', 'again', 'the', 'leading', 'cause', 'of', 'death', 'in', 'Indiana', 'based', 'on', 'average', 'daily', 'deaths', 'and', 'closing', 'in', 'on', 'aga']



In [140]:
'''
b. Remove stop words if you haven't done so earlier, be careful
'''
stop_words = set(stopwords.words('english'))

In [141]:
def remove_stop_words(tokens):
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words and len(word) > 1]
    return filtered_tokens

jungyu_df['filtered_tokens'] = jungyu_df['tokens'].apply(remove_stop_words)

for i, tokens in enumerate(jungyu_df['filtered_tokens']):
    print(f'Tweet {i+1} filtered tokens:\n{tokens}\n')

Tweet 1 filtered tokens:
['sir', 'keshod', 'mamlatdar', 'office', 'says', 'didnt', 'receive', 'grant', 'covid19', 'exgratia', 'claim', 'govt', 'Many']

Tweet 2 filtered tokens:
['Informally', 'reached', 'epidemiologists', 'open', 'position', 'Arizona', 'response', 'would', 'work', 'AZ']

Tweet 3 filtered tokens:
['joke', 'really', 'isnt', 'funny', 'serious', 'completely', 'touch', 'reality']

Tweet 4 filtered tokens:
['Well', 'COVID19', 'leading', 'cause', 'death', 'Indiana', 'based', 'average', 'daily', 'deaths', 'closing', 'aga']



In [142]:
'''
c. Per tweet choose four words randomly. 
d. Get synonyms of each of the words selected in step iii
e. Select the most similar synonym, and replace the original word with the synonym to create a new tweet (You should not replace the original tweet, you need to add a new copy of each tweet using the selected synonyms to your dataframe as a row and maintain the original sentiment
'''
word2vec_model_path = 'C:/Users/Public/6th/NLP and Recommender Systems/Assignment1/GoogleNews-vectors-negative300.bin'
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_model_path, binary=True)

In [143]:
aug = naw.WordEmbsAug(
    model_type='word2vec',
    model_path=word2vec_model_path,
    action='substitute',
    aug_max=4,
    stopwords=stop_words
)

In [144]:
augmented_tweets = []

for i, row in jungyu_df.iterrows():
    original_tweet = ' '.join(row['filtered_tokens']) 
    
    augmented_tweet = aug.augment(original_tweet)
    
    augmented_tweet = ''.join(augmented_tweet)
    
    augmented_tweets.append({'sentiment': row['sentiment'], 'cleaned_text': augmented_tweet})

In [145]:
augmented_df = pd.DataFrame(augmented_tweets)
augmented_df

Unnamed: 0,sentiment,cleaned_text
0,positive,sir keshod mamlatdar office says didnt receive funding covid19 Rs1_lakh undisputed JK_govt Many
1,negative,Informally cwalters@pioneerpress.com epidemiologists open position Arizona response promised work Terry_Goddard
2,positive,joke really isnt funny catastrophic similarly touch Unan1mous
3,negative,Hmmmmmm COVID19 world'slargest cause death Indiana USA_Subjex_Corporation average daily deaths closure aga


In [146]:
jungyu_df['cleaned_text'] = jungyu_df['filtered_tokens'].apply(lambda tokens: ' '.join(tokens))

In [147]:
jungyu_df_concat = pd.concat([jungyu_df[['sentiment', 'cleaned_text']], augmented_df], ignore_index=True)
jungyu_df_concat

Unnamed: 0,sentiment,cleaned_text
0,positive,sir keshod mamlatdar office says didnt receive grant covid19 exgratia claim govt Many
1,negative,Informally reached epidemiologists open position Arizona response would work AZ
2,positive,joke really isnt funny serious completely touch reality
3,negative,Well COVID19 leading cause death Indiana based average daily deaths closing aga
4,positive,sir keshod mamlatdar office says didnt receive funding covid19 Rs1_lakh undisputed JK_govt Many
5,negative,Informally cwalters@pioneerpress.com epidemiologists open position Arizona response promised work Terry_Goddard
6,positive,joke really isnt funny catastrophic similarly touch Unan1mous
7,negative,Hmmmmmm COVID19 world'slargest cause death Indiana USA_Subjex_Corporation average daily deaths closure aga


In [148]:
output_filepath = 'C:/Users/Public/6th/NLP and Recommender Systems/Assignment1/jungyu_df_after_random_insertion.txt'
jungyu_df_concat.to_csv(output_filepath, index=False, sep='\t')