In [1]:
import re
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
import json
import pickle

In [2]:
#####
# Fake News
#####
class FakeNewsDataset(Dataset):
    """Map-style dataset of tweets labelled 'fake' or 'real'.

    The data lives in a pandas DataFrame (``self.data``) that is either passed
    in directly or read from a tab-separated file. Each item is tokenised on
    access with the supplied tokenizer.

    Args:
        tokenizer: object exposing ``encode(text, add_special_tokens=...)``.
        dataset_path: path (or file-like object) of a TSV file handed to
            ``pandas.read_csv``; used only when ``dataset`` is None.
        dataset: an already-built DataFrame with a ``tweet`` column (and a
            ``label`` column unless ``is_test`` is true).
        no_special_token: when true, ``encode`` is called with
            ``add_special_tokens=False``.
        is_test: when true, items carry no label.
        process: when true, ``preprocess_tweet`` is applied to the ``tweet``
            column of data loaded from ``dataset_path``.
        *args, **kwargs: accepted and ignored.

    Raises:
        ValueError: if neither ``dataset`` nor ``dataset_path`` is given.
    """
    # Static constant variable
    LABEL2INDEX = {'fake': 0, 'real': 1}
    INDEX2LABEL = {0: 'fake', 1: 'real'}
    NUM_LABELS = 2
    EMOJI_PATTERN = re.compile(
    "["
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "\U000024C2-\U0001F251"
    "]+")

    def preprocess_tweet(self, text):
        """Return ``text`` with https URLs, emoji and non-ASCII characters normalised.

        Steps, in order: every ``https://...`` run becomes ``<URL>``; characters
        matched by ``EMOJI_PATTERN`` are removed; any remaining non-ASCII
        character is dropped; the HTML entity ``&amp;`` becomes ``&``.
        """
        text = re.sub(r'(https://\S+)', '<URL>', text)
        text = self.EMOJI_PATTERN.sub(r'', text)
        # Round-trip through ASCII to discard every character outside that range.
        text = text.encode("ascii", "ignore").decode()
        text = text.replace('&amp;', '&')

        return text

    def load_dataset(self, path):
        """Read a tab-separated file into a DataFrame.

        If a ``label`` column is present its values are replaced by their
        ``LABEL2INDEX`` integers (an unknown label raises ``KeyError``). If the
        instance was created with ``process=True`` and a ``tweet`` column is
        present, every tweet is passed through ``preprocess_tweet``.
        """
        df = pd.read_csv(path, sep='\t')
        if 'label' in df:
            df['label'] = df['label'].apply(lambda x: self.LABEL2INDEX[x])
        # Honour the `process` flag, which used to be stored but never consulted.
        if getattr(self, 'process', False) and 'tweet' in df:
            df['tweet'] = df['tweet'].apply(self.preprocess_tweet)
        return df

    def __init__(self, tokenizer, dataset_path=None, dataset=None, no_special_token=False, is_test=False, process=False, *args, **kwargs):
        # `process` must be set before load_dataset() runs because it is read there.
        self.is_test = is_test
        self.process = process
        if dataset is not None:
            self.data = dataset
        elif dataset_path is not None:
            self.data = self.load_dataset(dataset_path)
        else:
            raise ValueError('Either `dataset` or `dataset_path` must be provided')
        self.tokenizer = tokenizer
        self.no_special_token = no_special_token

    def __getitem__(self, index):
        """Return the item at integer position ``index``.

        Returns ``(index, subword_ids, raw_tweet)`` when ``is_test`` is true and
        ``(index, subword_ids, label, raw_tweet)`` otherwise, where
        ``subword_ids`` and ``label`` are numpy arrays.
        """
        # Positional lookup: works for any DataFrame index (e.g. a filtered
        # frame) and raises IndexError past the end, as the sequence protocol expects.
        row = self.data.iloc[index]
        text = row['tweet']
        subwords = self.tokenizer.encode(text, add_special_tokens=not self.no_special_token)
        if self.is_test:
            return index, np.array(subwords), text
        return index, np.array(subwords), np.array(row['label']), text

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.data)
        
class FakeNewsDataLoader(DataLoader):
    """DataLoader that pads batches of FakeNewsDataset-style items.

    ``max_seq_len`` and ``is_test`` are the first two positional parameters;
    everything else is forwarded untouched to ``DataLoader.__init__``. The
    loader's ``collate_fn`` is always replaced by ``_collate_fn``.
    """

    def __init__(self, max_seq_len=512, is_test=False, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.max_seq_len = max_seq_len
        self.is_test = is_test
        # Installed after the parent constructor so it overrides any collate_fn passed in.
        self.collate_fn = self._collate_fn

    def _collate_fn(self, batch):
        """Pad a list of samples into fixed-width numpy arrays.

        Each sample is ``(id, subwords, raw_text)`` in test mode and
        ``(id, subwords, label, raw_text)`` otherwise. The padded width is the
        longest subword sequence in the batch, capped at ``self.max_seq_len``;
        longer sequences are truncated.

        Returns:
            test mode: ``(ids, subword_batch, mask_batch, seq_list)``
            otherwise: ``(ids, subword_batch, mask_batch, label_batch, seq_list)``
            where ``subword_batch`` is int64, ``mask_batch`` is float32 with 1
            on real tokens and 0 on padding, and ``label_batch`` is int64 of
            shape ``(batch, 1)``.
        """
        n_rows = len(batch)
        longest = max(len(sample[1]) for sample in batch)
        width = min(self.max_seq_len, longest)

        subword_batch = np.zeros((n_rows, width), dtype=np.int64)
        mask_batch = np.zeros((n_rows, width), dtype=np.float32)
        ids, seq_list = [], []

        def place(row, tokens):
            # Write one (possibly truncated) sequence and mark its positions in the mask.
            clipped = tokens[:width]
            subword_batch[row, :len(clipped)] = clipped
            mask_batch[row, :len(clipped)] = 1

        if self.is_test:
            for row, (sample_id, tokens, raw_text) in enumerate(batch):
                ids.append(sample_id)
                place(row, tokens)
                seq_list.append(raw_text)
            return ids, subword_batch, mask_batch, seq_list

        label_batch = np.zeros((n_rows, 1), dtype=np.int64)
        for row, (sample_id, tokens, label, raw_text) in enumerate(batch):
            ids.append(sample_id)
            place(row, tokens)
            label_batch[row, 0] = label
            seq_list.append(raw_text)
        return ids, subword_batch, mask_batch, label_batch, seq_list

In [3]:
# Load the per-percentage index arrays and turn them into IDs.
# NOTE: pickle.load can execute arbitrary code; only open this file if it comes from a trusted source.
with open('./tmp/index_percent_list_all_old.pkl', 'rb') as pkl_file:  # closes the handle deterministically
    indices = pickle.load(pkl_file)
for k, v in indices.items():
    indices[k] = v + 1 # Increment index to get ID
# IDs stored under the '0.99' key, as a plain Python list.
idx_99 = indices['0.99'].tolist()

In [None]:
# NOTE(review): this never-executed cell repeats the index-loading cell directly before it;
# consider deleting one of the two.
# NOTE: pickle.load can execute arbitrary code; only open this file if it comes from a trusted source.
with open('./tmp/index_percent_list_all_old.pkl', 'rb') as pkl_file:  # closes the handle deterministically
    indices = pickle.load(pkl_file)
for k, v in indices.items():
    indices[k] = v + 1 # Increment index to get ID
idx_99 = indices['0.99'].tolist()

In [4]:
# Hard-coded values that later cells match against the `id` column of the training DataFrame.
# presumably a random ~2% sample of the training IDs — TODO confirm how it was drawn (no seed/code is recorded here)
idx_2_rand = [195, 3956, 2848, 4539, 5338, 885, 1524, 584, 3760, 1264, 359, 2630, 1493, 5419, 3402, 4038, 4139, 5896, 6176, 1390, 5526, 478, 1083, 1848, 4960, 3267, 5320, 2236, 4939, 3056, 5488, 618, 2804, 6007, 3726, 1796, 3740, 3357, 2980, 83, 198, 508, 2526, 3429, 5648, 58, 5594, 3689, 2703, 507, 3626, 3991, 5152, 1580, 2214, 2322, 3611, 2706, 834, 1692, 5383, 3494, 4748, 655, 4881, 120, 472, 747, 4811, 2413, 4212, 2065, 680, 5934, 1356, 3363, 612, 5576, 1851, 19, 4666, 1395, 3403, 5228, 5808, 1898, 4701, 1486, 3196, 1090, 3690, 2732, 1906, 366, 5694, 4674, 1752, 3559, 3438, 4661, 5028, 2024, 2818, 672, 232, 1006, 3703, 411, 2733, 3927, 604, 2189, 4277, 2426, 3780, 1969, 4607, 573, 1620, 4034, 4477, 77, 3964, 5482, 6381]
# presumably a random ~5% sample of the training IDs — TODO confirm how it was drawn
idx_5_rand = [2244, 3158, 3868, 2832, 4122, 99, 6133, 2606, 2169, 5609, 1286, 5071, 5402, 1457 , 6387, 4565, 5707, 5686, 1155, 5225, 4998, 695, 4014, 2816, 4825, 1319, 895, 940 , 658, 4182, 204, 2706, 5820, 6239, 5506, 5915, 2466, 1162, 1297, 1633, 5906, 3851 , 6031, 5251, 5188, 2445, 4379, 2047, 2774, 509, 1414, 5776, 2076, 3442, 5722, 804 , 3736, 6416, 5026, 455, 6210, 1056, 6271, 3984, 5746, 6257, 4759, 3243, 1338, 1862 , 2317, 6229, 1839, 1761, 1890, 5236, 2880, 3159, 2283, 6360, 1647, 6191, 2180, 914 , 5025, 1824, 1424, 4451, 153, 5620, 4588, 5341, 295, 4835, 4404, 1137, 5034, 3096 , 5494, 616, 2519, 4526, 2098, 4773, 3958, 1807, 4791, 2686, 5877, 5193, 3481, 4553 , 2620, 5586, 5513, 6368, 4763, 240, 4853, 5721, 1946, 1677, 3622, 4222, 724, 4118 , 3106, 3416, 2839, 4502, 2294, 1908, 4620, 3372, 2522, 5566, 3585, 5202, 1786, 1105 , 4301, 6174, 1374, 3364, 6171, 116, 4181, 3565, 290, 4534, 1412, 2367, 4641, 6168 , 2602, 6009, 3675, 4591, 560, 38, 530, 4004, 5652, 3817, 1750, 1114, 4927, 2116 , 1092, 2130, 3914, 5731, 188, 6330, 6334, 457, 1216, 3600, 1208, 5072, 6269, 2729 , 2624, 1340, 1235, 832, 481, 4598, 2748, 1010, 1770, 2149, 1159, 463, 4728, 4947 , 2813, 5714, 382, 5051, 4472, 3707, 5114, 5647, 2874, 706, 2394, 5354, 5462, 441 , 1573, 919, 196, 4395, 3380, 911, 624, 3044, 1526, 3008, 1239, 852, 2072, 1599 , 1233, 2320, 3587, 6108, 4686, 4909, 2705, 5102, 6164, 2775, 6035, 2984, 1804, 4975 , 1453, 4512, 5789, 524, 5155, 3056, 1552, 3976, 4375, 5115, 1583, 5049, 941, 3760 , 5682, 512, 1516, 3388, 2357, 4538, 2426, 5298, 4577, 310, 6162, 469, 3135, 5004 , 3452, 2315, 2161, 3646, 1283, 165, 5235, 3058, 3854, 4770, 151, 3696, 3035, 6234 , 5218, 2203, 2408, 1910, 3877, 6032, 2194, 2494, 3552, 1256, 1521, 2770, 3936, 5784 , 1806, 1749, 1502, 3796, 219, 1611, 2727, 4227, 5252, 3987, 5641, 1000, 4722, 4255 , 5662, 183, 6127, 5815, 6088, 2087]

In [5]:
# Build the 'roberta-large' tokenizer, then wrap the training TSV in a FakeNewsDataset.
# `lowercase` is not a named parameter of FakeNewsDataset.__init__; it is absorbed by **kwargs.
tokenizer = AutoTokenizer.from_pretrained('roberta-large')
dataset_path = './data/train.tsv'
dataset = FakeNewsDataset(
    dataset_path=dataset_path,
    tokenizer=tokenizer,
    lowercase=False,
)

In [6]:
# Take the DataFrame held by the current dataset and display its (rows, columns) shape.
df = dataset.data
df.shape

(6299, 3)

In [7]:
df = dataset.data
# Export the rows whose `id` is in idx_2_rand as {row index: {column: value}}.
# 'index' is the documented orient name; the former 'instance' spelling only worked through
# prefix matching in older pandas and is rejected by pandas >= 2.0.
with open('clean_2%_rand.json', 'w') as out_file:  # context manager flushes and closes the file
    json.dump(df.loc[df['id'].isin(idx_2_rand),:].to_dict(orient='index'), out_file)

In [8]:
df = dataset.data
# Export the rows whose `id` is in idx_5_rand as {row index: {column: value}}.
# 'index' is the documented orient name; the former 'instance' spelling only worked through
# prefix matching in older pandas and is rejected by pandas >= 2.0.
with open('clean_5%_rand.json', 'w') as out_file:  # context manager flushes and closes the file
    json.dump(df.loc[df['id'].isin(idx_5_rand),:].to_dict(orient='index'), out_file)

In [9]:
df = dataset.data
# Export the rows whose DataFrame index label is NOT in idx_99 as {row index: {column: value}}.
# 'index' is the documented orient name; the former 'instance' spelling only worked through
# prefix matching in older pandas and is rejected by pandas >= 2.0.
# NOTE(review): idx_99 is shifted by +1 "to get ID" where it is built, yet it is matched against
# df.index here, while idx_2_rand / idx_5_rand are matched against df['id'] — confirm which is intended.
with open('clean_99%.json', 'w') as out_file:  # context manager flushes and closes the file
    json.dump(df.loc[~df.index.isin(idx_99),:].to_dict(orient='index'), out_file)

In [13]:
# `id` values of df that also occur in each hard-coded sample list.
id2 = df['id'][df['id'].isin(idx_2_rand)].tolist()
id5 = df['id'][df['id'].isin(idx_5_rand)].tolist()

In [14]:
# `id` values of the rows whose DataFrame index label is NOT in idx_99.
# NOTE(review): idx_99 is shifted by +1 "to get ID" where it is built, yet it is matched against
# df.index here rather than df['id'] — confirm which of the two is intended.
ids = df.loc[~df.index.isin(idx_99),'id'].tolist()

In [15]:
# Count neighbouring entries of `ids` whose values differ by exactly one,
# then display that count next to the total number of entries.
c = sum(1 for current, following in zip(ids, ids[1:]) if following == current + 1)
c, len(ids)

(0, 64)

In [18]:
# Sizes of the three ID sets, followed by the sizes of their pairwise differences.
ids_set, id5_set, id2_set = set(ids), set(id5), set(id2)
(
    len(ids_set), len(id5_set), len(id2_set),
    len(ids_set - id5_set), len(ids_set - id2_set),
    len(id5_set - id2_set), len(id2_set - id5_set),
)

(64, 314, 125, 62, 62, 310, 121)

In [10]:
# Build the 'roberta-large' tokenizer, then wrap the validation TSV in a FakeNewsDataset.
# `lowercase` is not a named parameter of FakeNewsDataset.__init__; it is absorbed by **kwargs.
tokenizer = AutoTokenizer.from_pretrained('roberta-large')
dataset_path = './data/valid.tsv'
dataset = FakeNewsDataset(
    dataset_path=dataset_path,
    tokenizer=tokenizer,
    lowercase=False,
)

In [11]:
# Take the DataFrame held by the current dataset and display its (rows, columns) shape.
df = dataset.data
df.shape

(2140, 3)

In [12]:
# Build the 'roberta-large' tokenizer, then wrap the test TSV in a FakeNewsDataset in test mode.
# `lowercase` is not a named parameter of FakeNewsDataset.__init__; it is absorbed by **kwargs.
tokenizer = AutoTokenizer.from_pretrained('roberta-large')
dataset_path = './data/test.tsv'
dataset = FakeNewsDataset(
    dataset_path=dataset_path,
    tokenizer=tokenizer,
    lowercase=False,
    is_test=True,
)

In [13]:
# Take the DataFrame held by the current dataset and display its (rows, columns) shape.
df = dataset.data
df.shape

(2140, 2)