In [1]:
import pandas as pd
import csv
import string

In [50]:
#Load the OLID training set (tab-separated; columns: id, tweet, subtask_a, subtask_b, subtask_c)
train_data = pd.read_csv('olid-training-v1.0.tsv', delimiter='\t', encoding='utf-8')

#Extract tweets. The explicit copy makes this frame independent of train_data,
#so it can be cleaned in place later without a SettingWithCopyWarning.
train_tweets = train_data[['tweet']].copy()

#Extract the subtask labels and rename each class attribute.
#rename() returns a new frame with a fresh column Index; writing into
#columns.values would mutate the backing array of an immutable Index.
train_task_a_labels = train_data[['subtask_a']].rename(columns={'subtask_a': 'class_a'})
train_task_b_labels = train_data[['subtask_b']].rename(columns={'subtask_b': 'class_b'})
train_task_c_labels = train_data[['subtask_c']].rename(columns={'subtask_c': 'class_c'})

print(train_data)

          id                                              tweet subtask_a  \
0      86426  @USER She should ask a few native Americans wh...       OFF   
1      90194  @USER @USER Go home you’re drunk!!! @USER #MAG...       OFF   
2      16820  Amazon is investigating Chinese employees who ...       NOT   
3      62688  @USER Someone should'veTaken" this piece of sh...       OFF   
4      43605  @USER @USER Obama wanted liberals &amp; illega...       NOT   
...      ...                                                ...       ...   
13235  95338  @USER Sometimes I get strong vibes from people...       OFF   
13236  67210  Benidorm ✅  Creamfields ✅  Maga ✅   Not too sh...       NOT   
13237  82921  @USER And why report this garbage.  We don't g...       OFF   
13238  27429                                        @USER Pussy       OFF   
13239  46552  #Spanishrevenge vs. #justice #HumanRights and ...       NOT   

      subtask_b subtask_c  
0           UNT       NaN  
1           TIN    

In [51]:
#Function to clean tweets in a data frame's tweet column
def clean_tweets(df):
    """Clean the text of ``df['tweet']`` in place.

    The steps run in this order:

    1. drop the ``@USER`` and ``URL`` placeholder tokens;
    2. replace ``&amp`` with ``and`` and drop ``&lt`` / ``&gt`` (the trailing
       ``;`` of each entity is removed by the punctuation step);
    3. remove runs of digits;
    4. remove every character listed in ``string.punctuation``;
    5. drop all non-ASCII characters (emojis, curly quotes, ...);
    6. trim leading and trailing whitespace.

    Interior whitespace is not collapsed, so removed tokens can leave
    consecutive spaces behind.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweet`` column holding strings. Only that column is
        modified; missing values in it are left as missing.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    #Fixed substrings to substitute, matched literally (not as regular expressions)
    literal_replacements = (
        ('@USER', ''),    #Remove mentions (@USER)
        ('URL', ''),      #Remove URLs
        ('&amp', 'and'),  #Replace ampersand (&) with and
        ('&lt', ''),      #Remove &lt
        ('&gt', ''),      #Remove &gt
    )

    tweets = df['tweet']
    for old, new in literal_replacements:
        tweets = tweets.str.replace(old, new, regex=False)

    #Remove numbers. regex=True must be explicit: since pandas 2.0 the default
    #is a literal match, which would look for the text "\d+" and remove nothing.
    tweets = tweets.str.replace(r'\d+', '', regex=True)

    #Remove punctuations in a single pass
    tweets = tweets.str.translate(str.maketrans('', '', string.punctuation))

    #Remove emojis (and any other non-ASCII character)
    tweets = tweets.str.encode('ascii', 'ignore').str.decode('ascii')

    #Trim leading and trailing whitespaces, then write back into the caller's frame
    df.loc[:, 'tweet'] = tweets.str.strip()

In [52]:
#Clean the training tweets; clean_tweets edits the frame in place and returns nothing
clean_tweets(train_tweets)

In [5]:
#Wrap a tweet in single quotes
add_quotes = '\'{}\''.format

#Pair the cleaned tweets with each subtask's labels (joined on the row index)
#and apply quotes to the cleaned tweets. Series.map is used for the quoting
#because DataFrame.applymap is deprecated since pandas 2.1.
train_task_a_data = (
    train_tweets
    .join(train_task_a_labels)
    .assign(tweet=lambda frame: frame['tweet'].map(add_quotes))
)

train_task_b_data = (
    train_tweets
    .join(train_task_b_labels)
    .dropna() #Drop records with missing values
    .assign(tweet=lambda frame: frame['tweet'].map(add_quotes))
)

train_task_c_data = (
    train_tweets
    .join(train_task_c_labels)
    .dropna() #Drop records with missing values
    .assign(tweet=lambda frame: frame['tweet'].map(add_quotes))
)

#Export to csv file
# train_task_a_data.to_csv('olid_training_a.csv', index=None)
# train_task_b_data.to_csv('olid_training_b.csv', index=None)
# train_task_c_data.to_csv('olid_training_c.csv', index=None)

In [6]:
#Inspect the task A training frame: cleaned, quoted tweets next to their class_a labels
print(train_task_a_data)

                                                   tweet class_a
0      'She should ask a few native Americans what th...     OFF
1                      'Go home youre drunk  MAGA Trump'     OFF
2      'Amazon is investigating Chinese employees who...     NOT
3      'Someone shouldveTaken this piece of shit to a...     OFF
4      'Obama wanted liberals and illegals to move in...     NOT
...                                                  ...     ...
13235  'Sometimes I get strong vibes from people and ...     OFF
13236  'Benidorm   Creamfields   Maga    Not too shab...     NOT
13237  'And why report this garbage  We dont give a c...     OFF
13238                                            'Pussy'     OFF
13239  'Spanishrevenge vs justice HumanRights and Fre...     NOT

[13240 rows x 2 columns]


### Preparing the test sets

In [7]:
#Read tweet labels (names= supplies the column names, so the files are read as having no header row)
test_label_a = pd.read_csv('labels-levela.csv', encoding='utf-8',
                           index_col=False, names=['id', 'class_a'])
test_label_b = pd.read_csv('labels-levelb.csv', encoding='utf-8',
                           index_col=False, names=['id', 'class_b'])
test_label_c = pd.read_csv('labels-levelc.csv', encoding='utf-8',
                           index_col=False, names=['id', 'class_c'])

#Read tweets from test sets, merge tweets with labels by id, then drop id column
test_tweet_a = (
    pd.read_csv('testset-levela.tsv', delimiter='\t', encoding='utf-8')
    .merge(test_label_a, on='id')
    .drop(columns='id')
)
test_tweet_b = (
    pd.read_csv('testset-levelb.tsv', delimiter='\t', encoding='utf-8')
    .merge(test_label_b, on='id')
    .drop(columns='id')
)
test_tweet_c = (
    pd.read_csv('testset-levelc.tsv', delimiter='\t', encoding='utf-8')
    .merge(test_label_c, on='id')
    .drop(columns='id')
)

#Clean tweets in test sets (clean_tweets modifies each frame in place)
clean_tweets(test_tweet_a)
clean_tweets(test_tweet_b)
clean_tweets(test_tweet_c)

#Apply quotes to cleaned tweets. Series.map is used because
#DataFrame.applymap is deprecated since pandas 2.1.
add_quotes = '\'{}\''.format
test_tweet_a = test_tweet_a.assign(tweet=test_tweet_a['tweet'].map(add_quotes))
test_tweet_b = test_tweet_b.assign(tweet=test_tweet_b['tweet'].map(add_quotes))
test_tweet_c = test_tweet_c.assign(tweet=test_tweet_c['tweet'].map(add_quotes))


#Export to csv file
# test_tweet_a.to_csv('olid_test_a.csv', index=None,header=True)
# test_tweet_b.to_csv('olid_test_b.csv', index=None, header=True)
# test_tweet_c.to_csv('olid_test_c.csv', index=None, header=True)

print(test_tweet_a.head())

                                               tweet class_a
0  'WhoIsQ WheresTheServer DumpNike DECLASFISA De...     OFF
1  'ConstitutionDay is revered by Conservatives h...     NOT
2  'FOXNews NRA MAGA POTUS TRUMP ndAmendment RNC ...     NOT
3  'Watching Boomer getting the news that she is ...     NOT
4  'NoPasaran Unity demo to oppose the farright i...     OFF


In [8]:
#Print the cleaned level A test frame (pandas elides the middle rows of long frames)
print(test_tweet_a)

                                                 tweet class_a
0    'WhoIsQ WheresTheServer DumpNike DECLASFISA De...     OFF
1    'ConstitutionDay is revered by Conservatives h...     NOT
2    'FOXNews NRA MAGA POTUS TRUMP ndAmendment RNC ...     NOT
3    'Watching Boomer getting the news that she is ...     NOT
4    'NoPasaran Unity demo to oppose the farright i...     OFF
..                                                 ...     ...
855  'DespicableDems lie again about rifles Dem Dis...     OFF
856  'MeetTheSpeakers   will present in our event O...     NOT
857  'people just unfollowed me for talking about m...     OFF
858  'WednesdayWisdom Antifa calls the right fascis...     NOT
859            'Kavanaugh typical liberals  Democrats'     NOT

[860 rows x 2 columns]
