In [1]:
# Insert code here.
import pandas as pd
import numpy as np
import random
import re
import time
import datetime
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm, neighbors
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, BertConfig, AutoModel
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from torch.utils.data import Dataset
from tqdm import tqdm
# from sentence_transformers import SentenceTransformer
# sent_encoder = SentenceTransformer('bert-base-nli-mean-tokens')

In [2]:
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
torch.cuda.empty_cache()

There are 4 GPU(s) available.
We will use the GPU: GeForce RTX 2080 Ti


In [3]:

labels = ['fake','hate', 'defamation','offensive','non-hostile']
lab_num = 4
EPOCH_NUM = 0
lab = labels[lab_num]
epoch_name = '../temp/finetuned/'+lab+'_epoch_'+str(EPOCH_NUM)

In [4]:
models = ['ai4bharat/indic-bert', 'distilbert-base-uncased-finetuned-sst-2-english', 'textattack/roberta-base-SST-2','roberta-base', 'google/electra-base-discriminator', 'xlnet-base-cased', 'xlm-roberta-base', '/scratch/indic-tapt','/scratch/indic-tapt2', '/scratch/indic-tapt/checkpoint-500']
model_num = 0
tokenizer = AutoTokenizer.from_pretrained(models[model_num])
src = '../temp/preprocessed/'

In [5]:
# train = pd.read_csv('../datasets/covid/Constraint_English_Train - Sheet1.csv')
# test = pd.read_csv('../datasets/covid/Constraint_English_Val - Sheet1.csv')

In [6]:
import pickle
with open(src+'train.pickle','rb') as f:
    train = pickle.load(f)
    train = pd.DataFrame.from_dict(train)
    train.drop(train.head(1).index, inplace=True)
with open(src+'valid.pickle','rb') as f:
    valid = pickle.load(f)
    valid = pd.DataFrame.from_dict(valid)
    valid.drop(valid.head(1).index, inplace=True)
with open(src+'test.pickle','rb') as f:
    test = pickle.load(f)
#     del test['task_1']
    test = pd.DataFrame.from_dict(test)
#     test.drop(test.head(1).index, inplace=True)
#     test = pd.DataFrame.from_dict(test)
# test = pd.read_csv('data/valid.tsv', sep='\t')

In [7]:
train = pd.concat([train, valid])
test.head(10)

Unnamed: 0,tweet_id,full_tweet,tweet_raw_text,hashtags,smiley,emoji,url,mentions,numerals,reserved_word,emotext,segmented_hash,clean
0,1,‡§ï‡•Ä‡§∏ ‡§ï‡•Ä ‡§ï‡•ã ‡§∞‡•ã‡§ú‡§ó‡§æ‡§∞ ‡§ö‡§æ‡§π‡§ø‡§è ‡§´‡§ø‡§∞ ‡§®‡§π‡•Ä‡§Ç ‡§ï‡§π‡§®‡§æ ‡§∞‡•ã‡§ú‡§ó‡§æ‡§∞ ‡§®‡§π...,‡§ï‡•Ä‡§∏ ‡§ï‡•Ä ‡§ï‡•ã ‡§∞‡•ã‡§ú‡§ó‡§æ‡§∞ ‡§ö‡§æ‡§π‡§ø‡§è ‡§´‡§ø‡§∞ ‡§®‡§π‡•Ä‡§Ç ‡§ï‡§π‡§®‡§æ ‡§∞‡•ã‡§ú‡§ó‡§æ‡§∞ ‡§®‡§π...,[],[],[],[],[],"[20, 6, 10, 20, 6]",[],[],[],‡§ï‡•Ä‡§∏ ‡§ï‡•Ä ‡§ï‡•ã ‡§∞‡•ã‡§ú‡§ó‡§æ‡§∞ ‡§ö‡§æ‡§π‡§ø‡§è ‡§´‡§ø‡§∞ ‡§®‡§π‡•Ä‡§Ç ‡§ï‡§π‡§®‡§æ ‡§∞‡•ã‡§ú‡§ó‡§æ‡§∞ ‡§®‡§π...
1,2,‡§™‡§ü‡§®‡§æ: BMP ‡§ï‡•à‡§Ç‡§™ ‡§Æ‡•á‡§Ç ‡§™‡•Å‡§∞‡•Å‡§∑ ‡§î‡§∞ ‡§Æ‡§π‡§ø‡§≤‡§æ ‡§ï‡§æ‡§Ç‡§∏‡•ç‡§ü‡•á‡§¨‡§≤ ‡§®‡•á...,‡§™‡§ü‡§®‡§æ : BMP ‡§ï‡•à‡§Ç‡§™ ‡§Æ‡•á‡§Ç ‡§™‡•Å‡§∞‡•Å‡§∑ ‡§î‡§∞ ‡§Æ‡§π‡§ø‡§≤‡§æ ‡§ï‡§æ‡§Ç‡§∏‡•ç‡§ü‡•á‡§¨‡§≤ ...,[],[],[],[https://t.co/Dq05hREifM],[@kumarprakash4u],[],[],[],[],‡§™‡§ü‡§®‡§æ: BMP ‡§ï‡•à‡§Ç‡§™ ‡§Æ‡•á‡§Ç ‡§™‡•Å‡§∞‡•Å‡§∑ ‡§î‡§∞ ‡§Æ‡§π‡§ø‡§≤‡§æ ‡§ï‡§æ‡§Ç‡§∏‡•ç‡§ü‡•á‡§¨‡§≤ ‡§®‡•á...
2,3,"‡§ï‡•ã‡§à ‡§≠‡•Ä ‡§ï‡§æ‡§Ç‡§ó‡•ç‡§∞‡•á‡§∏‡•Ä, ‡§ä‡§Ç‡§ö‡•Ä ‡§õ‡§§ ‡§™‡§∞, ‡§∞‡•á‡§≤‡§µ‡•á ‡§≤‡§æ‡§á‡§® ‡§™‡§∞, ‡§ä...","‡§ï‡•ã‡§à ‡§≠‡•Ä ‡§ï‡§æ‡§Ç‡§ó‡•ç‡§∞‡•á‡§∏‡•Ä , ‡§ä‡§Ç‡§ö‡•Ä ‡§õ‡§§ ‡§™‡§∞ , ‡§∞‡•á‡§≤‡§µ‡•á ‡§≤‡§æ‡§á‡§® ‡§™...",[],[],"[üôè, üòÇ, üëç]",[],[],[],[],"[folded hands, face with tears of joy, thumbs up]",[],"‡§ï‡•ã‡§à ‡§≠‡•Ä ‡§ï‡§æ‡§Ç‡§ó‡•ç‡§∞‡•á‡§∏‡•Ä, ‡§ä‡§Ç‡§ö‡•Ä ‡§õ‡§§ ‡§™‡§∞, ‡§∞‡•á‡§≤‡§µ‡•á ‡§≤‡§æ‡§á‡§® ‡§™‡§∞, ‡§ä..."
3,4,‡§Ö‡§Ç‡§°‡§∞‡§µ‡§∞‡•ç‡§≤‡•ç‡§° ‡§°‡•â‡§® ‡§õ‡•ã‡§ü‡§æ ‡§∞‡§æ‡§ú‡§® ‡§ï‡•á ‡§≠‡§æ‡§à ‡§ï‡•ã ‡§¨‡•Ä‡§ú‡•á‡§™‡•Ä ‡§¶‡•ç‡§µ‡§æ...,‡§Ö‡§Ç‡§°‡§∞‡§µ‡§∞‡•ç‡§≤‡•ç‡§° ‡§°‡•â‡§® ‡§õ‡•ã‡§ü‡§æ ‡§∞‡§æ‡§ú‡§® ‡§ï‡•á ‡§≠‡§æ‡§à ‡§ï‡•ã ‡§¨‡•Ä‡§ú‡•á‡§™‡•Ä ‡§¶‡•ç‡§µ‡§æ...,[],[],[],[],[],[],[],[],[],‡§Ö‡§Ç‡§°‡§∞‡§µ‡§∞‡•ç‡§≤‡•ç‡§° ‡§°‡•â‡§® ‡§õ‡•ã‡§ü‡§æ ‡§∞‡§æ‡§ú‡§® ‡§ï‡•á ‡§≠‡§æ‡§à ‡§ï‡•ã ‡§¨‡•Ä‡§ú‡•á‡§™‡•Ä ‡§¶‡•ç‡§µ‡§æ...
4,5,RT @_Pb_swain_: ‡§á‡§® ‡§™‡§Ç‡§ö‡§∞ ‡§õ‡§æ‡§™‡•ã‡§Ç ‡§ï‡•ã ‡§ï‡•ã‡§® ‡§∏‡§Æ‡§ù‡§æ‡§è ‡§ï‡§ø ...,: ‡§á‡§® ‡§™‡§Ç‡§ö‡§∞ ‡§õ‡§æ‡§™‡•ã‡§Ç ‡§ï‡•ã ‡§ï‡•ã‡§® ‡§∏‡§Æ‡§ù‡§æ‡§è ‡§ï‡§ø ‡§â‡§®‡§ï‡•á ‡§∞‡•ã‡§ú‡§ó‡§æ‡§∞ ‡§Æ...,[],[],"[üëá, üòÇ, üòÇ, üòÇ, üòÇ]",[],[@_Pb_swain_],[],[RT],"[backhand index pointing down, face with tears...",[],RT : ‡§á‡§® ‡§™‡§Ç‡§ö‡§∞ ‡§õ‡§æ‡§™‡•ã‡§Ç ‡§ï‡•ã ‡§ï‡•ã‡§® ‡§∏‡§Æ‡§ù‡§æ‡§è ‡§ï‡§ø ‡§â‡§®‡§ï‡•á ‡§∞‡•ã‡§ú‡§ó‡§æ...
5,6,"‡§™‡§∂‡•ç‡§ö‡§ø‡§Æ ‡§¨‡§Ç‡§ó‡§æ‡§≤ ‡§Æ‡•á‡§Ç ‡§´‡§ø‡§∞ ‡§π‡•Å‡§à ‡§Æ‡§æ‡§ì‡§µ‡§æ‡§¶‡§ø‡§Ø‡•ã‡§Ç ‡§ï‡•Ä ‡§µ‡§æ‡§™‡§∏‡•Ä, ...","‡§™‡§∂‡•ç‡§ö‡§ø‡§Æ ‡§¨‡§Ç‡§ó‡§æ‡§≤ ‡§Æ‡•á‡§Ç ‡§´‡§ø‡§∞ ‡§π‡•Å‡§à ‡§Æ‡§æ‡§ì‡§µ‡§æ‡§¶‡§ø‡§Ø‡•ã‡§Ç ‡§ï‡•Ä ‡§µ‡§æ‡§™‡§∏‡•Ä ,...","[#Maoist, #WestBengal]",[],[],[https://t.co/pP1AOvOv0b],[],[],[],[],"[maoist, west bengal]","‡§™‡§∂‡•ç‡§ö‡§ø‡§Æ ‡§¨‡§Ç‡§ó‡§æ‡§≤ ‡§Æ‡•á‡§Ç ‡§´‡§ø‡§∞ ‡§π‡•Å‡§à ‡§Æ‡§æ‡§ì‡§µ‡§æ‡§¶‡§ø‡§Ø‡•ã‡§Ç ‡§ï‡•Ä ‡§µ‡§æ‡§™‡§∏‡•Ä, ..."
6,7,#Breaking-‡§ï‡§Ç‡§ó‡§®‡§æ ‡§Æ‡§æ‡§Æ‡§≤‡•á ‡§™‡§∞ ‡§¨‡•ã‡§≤‡•á ‡§Æ‡§®‡•ã‡§ú ‡§§‡§ø‡§µ‡§æ‡§∞‡•Ä-‡§ï‡§π‡§æ ...,#Breaking-‡§ï‡§Ç‡§ó‡§®‡§æ ‡§Æ‡§æ‡§Æ‡§≤‡•á ‡§™‡§∞ ‡§¨‡•ã‡§≤‡•á ‡§Æ‡§®‡•ã‡§ú ‡§§‡§ø‡§µ‡§æ‡§∞‡•Ä-‡§ï‡§π‡§æ ...,"[#Breaking, #Sushantsinghcase, #Kangana]",[],[],[https://t.co/szOTZWq1hI],[],[],[],[],"[breaking, sushantsinghcase, kangana]",-‡§ï‡§Ç‡§ó‡§®‡§æ ‡§Æ‡§æ‡§Æ‡§≤‡•á ‡§™‡§∞ ‡§¨‡•ã‡§≤‡•á ‡§Æ‡§®‡•ã‡§ú ‡§§‡§ø‡§µ‡§æ‡§∞‡•Ä-‡§ï‡§π‡§æ ‡§ß‡§Æ‡§ï‡•Ä ‡§Æ‡§ø‡§≤...
7,8,@BasudebaTripat4: @Rajanspsingh1 ‡§Ö‡§ö‡•ç‡§õ‡§æ ‡§ï‡§ø‡§Ø‡§æ ‡§∏‡§æ...,": ‡§Ö‡§ö‡•ç‡§õ‡§æ ‡§ï‡§ø‡§Ø‡§æ ‡§∏‡§æ‡§≤‡•á ‡§ï‡§æ ‡§∏‡§∞ ‡§´‡•ã‡§°‡§º ‡§¶‡§ø‡§Ø‡§æ , , ‡§ó‡§∞‡•ç‡§¶‡§®...",[],[],[],[],"[@BasudebaTripat4, @Rajanspsingh1]",[],[],[],[],": ‡§Ö‡§ö‡•ç‡§õ‡§æ ‡§ï‡§ø‡§Ø‡§æ ‡§∏‡§æ‡§≤‡•á ‡§ï‡§æ ‡§∏‡§∞ ‡§´‡•ã‡§°‡§º ‡§¶‡§ø‡§Ø‡§æ,, ‡§ó‡§∞‡•ç‡§¶‡§® ‡§§..."
8,9,‡§π‡•à‡§¶‡§∞‡§æ‡§¨‡§æ‡§¶ ‡§¨‡•Ä‡§ú‡•á‡§™‡•Ä ‡§µ‡§ø‡§ß‡§æ‡§Ø‡§ï ‡§∞‡§æ‡§ú‡§æ ‡§∏‡§ø‡§Ç‡§π ‡§ï‡•Ä ‡§¨‡§π‡§® ‡§Æ‡§æ‡§Ø‡§æ ‡§¶...,‡§π‡•à‡§¶‡§∞‡§æ‡§¨‡§æ‡§¶ ‡§¨‡•Ä‡§ú‡•á‡§™‡•Ä ‡§µ‡§ø‡§ß‡§æ‡§Ø‡§ï ‡§∞‡§æ‡§ú‡§æ ‡§∏‡§ø‡§Ç‡§π ‡§ï‡•Ä ‡§¨‡§π‡§® ‡§Æ‡§æ‡§Ø‡§æ ‡§¶...,[],[],[],[],[],[],[],[],[],‡§π‡•à‡§¶‡§∞‡§æ‡§¨‡§æ‡§¶ ‡§¨‡•Ä‡§ú‡•á‡§™‡•Ä ‡§µ‡§ø‡§ß‡§æ‡§Ø‡§ï ‡§∞‡§æ‡§ú‡§æ ‡§∏‡§ø‡§Ç‡§π ‡§ï‡•Ä ‡§¨‡§π‡§® ‡§Æ‡§æ‡§Ø‡§æ ‡§¶...
9,10,"‡§ï‡§Æ‡§≤‡§®‡§æ‡§• ‡§ï‡•á ‡§∞‡§æ‡§ú ‡§Æ‡•á‡§Ç 100,‚Çπ ‡§Æ‡•á‡§Ç 100‡§Ø‡•Ç‡§®‡§ø‡§ü ‡§¨‡§ø‡§ú‡§≤‡•Ä ‡§Æ‡§ø‡§≤...","‡§ï‡§Æ‡§≤‡§®‡§æ‡§• ‡§ï‡•á ‡§∞‡§æ‡§ú ‡§Æ‡•á‡§Ç , ‚Çπ ‡§Æ‡•á‡§Ç 100‡§Ø‡•Ç‡§®‡§ø‡§ü ‡§¨‡§ø‡§ú‡§≤‡•Ä ‡§Æ‡§ø‡§≤ ‡§∞...",[],[],[],[],[],"[100, 100, 500, 1000]",[],[],[],"‡§ï‡§Æ‡§≤‡§®‡§æ‡§• ‡§ï‡•á ‡§∞‡§æ‡§ú ‡§Æ‡•á‡§Ç 100,‚Çπ ‡§Æ‡•á‡§Ç 100‡§Ø‡•Ç‡§®‡§ø‡§ü ‡§¨‡§ø‡§ú‡§≤‡•Ä ‡§Æ‡§ø‡§≤..."


In [8]:
lab = labels[lab_num]
# def label_encode(val):
#     return labels.index(val)
def label_encode(val):
    val = val.split(',')
    if lab_num == 4:
        if lab in val:
            return 0
        else:
            return 1
    else:
        if lab in val:
            return 1
        else:
            return 0

In [9]:
train['label'] = train.task_1.apply(label_encode)
train['tweet'] = train.full_tweet
test['tweet'] = test.full_tweet

In [10]:
train.label.sample(10)

4172    0
148     1
3952    0
3643    0
866     1
2338    1
969     1
2820    1
5159    1
2827    0
Name: label, dtype: int64

In [11]:
train = train.reset_index(drop=True)
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = []

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
#    text = re.sub(r'\W+', '', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text
# train.tweet = train.tweet.apply(clean_text)
# train.tweet = train.tweet.str.replace('\d+', '')

In [12]:
# test.label = test.label.apply(label_encode)
test = test.reset_index(drop=True)
# test.tweet = test.tweet.apply(clean_text)
# test.tweet = test.tweet.str.replace('\d+', '')

In [13]:
train.tweet.sample(10)

4979    ‡§¶‡•á‡§∂ ‡§®‡•á ‡§∞‡§ï‡•ç‡§∑‡§æ ‡§ï‡•ç‡§∑‡•á‡§§‡•ç‡§∞ ‡§Æ‡•á‡§Ç ‡§¨‡§°‡§º‡•Ä ‡§∏‡§´‡§≤‡§§‡§æ ‡§π‡§æ‡§∏‡§ø‡§≤ ‡§ï‡•Ä ‡§π...
3146    ‡§ï‡§Ç‡§ó‡§®‡§æ ‡§∞‡§®‡•å‡§§ ‡§î‡§∞ ‡§∂‡§ø‡§µ ‡§∏‡•á‡§®‡§æ ‡§ï‡•á ‡§¨‡•Ä‡§ö ‡§ú‡•Å‡§¨‡§æ‡§®‡•Ä ‡§ú‡§Ç‡§ó ‡•§ ‡§ï‡§Ç‡§ó...
4966    ‡§Ö‡§¨ ‡§§‡§ï ‡§∞‡§æ‡§ú‡•ç‡§Ø ‡§Æ‡•á‡§Ç 41.07 ‡§≤‡§æ‡§ñ ‡§®‡§Æ‡•Ç‡§®‡•ã‡§Ç ‡§ï‡•Ä ‡§ú‡§æ‡§Ç‡§ö ‡§ï‡•Ä ‡§ú‡§æ...
733     #RescueIndiaFromBJP ‡§ï‡§æ‡§Ç‡§ó‡•ç‡§∞‡•á‡§∏ ‡§ï‡•á ‡§ï‡•Å‡§§‡•ç‡§§‡•ã‡§Ç ‡§®‡•á ‡§è‡§ï ...
4642    BIHAR | ‡§ï‡•ã‡§∞‡•ã‡§®‡§æ ‡§Æ‡§π‡§æ‡§Æ‡§æ‡§∞‡•Ä ‡§ï‡•á ‡§¨‡•Ä‡§ö ‡§¨‡§ø‡§π‡§æ‡§∞ ‡§Æ‡•á‡§Ç 97 DSP...
742     ‡§ú‡•ç‡§Ø‡§æ‡§¶‡§æ ‡§™‡§∞‡•á‡§∂‡§æ‡§® ‡§®‡§æ ‡§π‡•ã‡§á‡§è ‡§´‡•ç‡§∞‡•Ä ‡§Æ‡•á‡§Ç ‡§Ö‡§ï‡§æ‡§â‡§Ç‡§ü ‡§Æ‡•á‡§Ç ‡§™‡•à‡§∏‡•á...
2       ‡§∏‡•Å‡§∂‡§æ‡§Ç‡§§ ‡§®‡•á ‡§ú‡•ã ‡§¨‡§ø‡§ú‡§®‡•á‡§∏ ‡§°‡•Ä‡§≤ 9 ‡§ú‡•Ç‡§® ‡§ï‡•ã ‡§ï‡•Ä ‡§•‡•Ä, ‡§µ‡•ã ‡§°‡•Ä‡§≤...
321     ‡§á‡§Ø‡•ã‡§® ‡§Æ‡•â‡§∞‡•ç‡§ó‡§® ‡§®‡•á ‡§â‡§Ç‡§ó‡§≤‡•Ä ‡§ü‡•Ç‡§ü‡§®‡•á ‡§ï‡•á ‡§¨‡§æ‡§µ‡§ú‡•Ç‡§¶ ‡§ñ‡•á‡§≤‡§®‡§æ ‡§ú‡§æ‡§∞...
477     ‡§∏‡•Å‡§

In [14]:
# split the dataset into training and validation datasets 
from sklearn.model_selection import train_test_split
# train_x, valid_x, train_y, valid_y = model_selection.train_test_split(train['tweet'], train['label'])
train_x, valid_x, train_y, valid_y = train_test_split(train['tweet'], train['label'], test_size=0.2)

In [15]:
def count_words(text):
    try:
        return len(text.split())
    except:
        print(text)
        return None

In [16]:
total = 0
maxw = 0
large_count = 0
for i in train_x:
    temp = count_words(i)
    total += temp
    maxw = temp if temp > maxw else maxw
    large_count += 1 if temp > 120 else 0
total/len(train_x), maxw, large_count, len(train_x)

(30.549323786793956, 2808, 23, 5028)

In [17]:
# MAX_LENGTH = 50
posts = train.values
categories = train.values

In [18]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 128
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 32
EPOCHS = 10
LEARNING_RATE = 1e-05


In [19]:
import gensim.models as gsm
e2v = gsm.KeyedVectors.load_word2vec_format('emoji2vec.bin', binary=True)
# happy_vector = e2v['üòÇ']    # Produces an embedding vector of length 300

# Download the bin file from here https://github.com/uclnlp/emoji2vec/blob/master/pre-trained/emoji2vec.bin

def getEmojiEmbeddings(emojiList,dim=300,verbose = False):
  """ Generates an emoji vector by averaging the emoji representation for each emoji. If no emoji returns an empty list of dimension dim"""
  if dim < 300:
    raise IndexError("Dim has to be greater than 300")
  result = np.zeros(dim)
  if (len(emojiList) == 0):
    return result
  else:
    embs = None
    for i in emojiList:
      if verbose:
        if i not in e2v.vocab:
          print(i)
    embs = np.mean([e2v[i] for i in emojiList if i in e2v.vocab], axis=0)
  if np.any(np.isnan(embs)):
    return result
  result[:300] = embs
  return result
getEmojiEmbeddings(valid.emoji.values[0])

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [20]:
ids = tokenizer.encode_plus(
            valid.full_tweet.values[0],
            None,
            truncation=True,
            add_special_tokens=True,
            max_length=128,
            pad_to_max_length=True,
            return_attention_mask = True,
            return_token_type_ids=True
        )['input_ids']
torch.tensor(ids, dtype=torch.long).shape, torch.tensor(getEmojiEmbeddings(valid.emoji.values[0]), dtype=torch.long).shape



(torch.Size([128]), torch.Size([300]))

In [21]:
class MultiLabelDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len, t = False):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.tweet
        self.emoji = dataframe.emoji
        self.hash = dataframe.segmented_hash
        self.t = t
        if not self.t:
            self.targets = self.data.label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_attention_mask = True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        h_text = self.hash[index]
        h_text = " ".join(h_text)
        inputs = self.tokenizer.encode_plus(
            h_text,
            None,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_attention_mask = True,
            return_token_type_ids=True
        )
        h_ids = inputs['input_ids']
        h_mask = inputs['attention_mask']
        h_token_type_ids = inputs["token_type_ids"]
#         h_inputs
        emoji = getEmojiEmbeddings(self.emoji[index])
        if self.t:
            return {
                'ids': torch.tensor(ids, dtype=torch.long),
                'mask': torch.tensor(mask, dtype=torch.long),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
                'h_ids': torch.tensor(h_ids, dtype=torch.long),
                'h_mask': torch.tensor(h_mask, dtype=torch.long),
                'h_token_type_ids': torch.tensor(h_token_type_ids, dtype=torch.long),
                'emoji' : torch.tensor(emoji, dtype=torch.long),
            }
        else:
            return {
                'ids': torch.tensor(ids, dtype=torch.long),
                'mask': torch.tensor(mask, dtype=torch.long),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
                'h_ids': torch.tensor(h_ids, dtype=torch.long),
                'h_mask': torch.tensor(h_mask, dtype=torch.long),
                'h_token_type_ids': torch.tensor(h_token_type_ids, dtype=torch.long),
                'emoji' : torch.tensor(emoji, dtype=torch.long),
                'targets': torch.tensor(self.targets[index], dtype=torch.long)
            }

In [22]:
# Creating the dataset and dataloader for the neural network

train_size = 0.85
train_data=train.sample(frac=train_size,random_state=200)
test_data=train.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(train.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)

FULL Dataset: (6286, 16)
TRAIN Dataset: (5343, 16)
TEST Dataset: (943, 16)


In [23]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [24]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = AutoModel.from_pretrained(models[model_num])
        self.l2 = AutoModel.from_pretrained(models[model_num])
        
        self.pre_classifier_1 = torch.nn.Linear(768, 768)
        self.pre_classifier_2 = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.pre_classifier_3 = torch.nn.Linear(1836, 1836)
#         self.pre_classifier_3 = torch.nn.Linear(768, 100)
        self.classifier = torch.nn.Linear(1836, 2)

    def forward(self, input_ids, attention_mask, token_type_ids, h_ids, h_mask, h_token_type_ids, emoji):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state_1 = output_1[0]
        pooler_1 = hidden_state_1[:, 0]
        pooler_1 = self.pre_classifier_1(pooler_1)
        pooler_1 = torch.nn.Tanh()(pooler_1)
        pooler_1 = self.dropout(pooler_1)
        output_2 = self.l2(input_ids=h_ids, attention_mask=h_mask)
        hidden_state_2 = output_2[0]
        pooler_2 = hidden_state_2[:, 0]
        pooler_2 = self.pre_classifier_2(pooler_2)
        pooler_2 = torch.nn.Tanh()(pooler_2)
        pooler_2 = self.dropout(pooler_2)
        pooler_3 = torch.cat((pooler_1, pooler_2), 1)
        pooler_3 = torch.cat((pooler_3, emoji), 1)
#         print(pooler_1.shape,hidden_state_1.shape, pooler_2.shape, emoji.type(torch.FloatTensor).shape)
#         pooler_3 = torch.nn.Tanh()(emoji.type(torch.FloatTensor))
#         pooler_3 = self.dropout(pooler_3)
#         print(pooler_3.shape)
        pooler_3 = self.pre_classifier_3(pooler_3)
#         pooler_3 = self.pre_classifier_3(pooler_2)
        pooler_3 = torch.nn.Tanh()(pooler_3)
        pooler_3 = self.dropout(pooler_3)
        output = self.classifier(pooler_3)
        return output

model = BERTClass()
model.to(device)

BERTClass(
  (l1): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(200000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
             

In [25]:
# from torchsummary import summary
# print(repr(model))


In [26]:
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))


The BERT model has 58 different named parameters.

==== Embedding Layer ====

l1.embeddings.word_embeddings.weight                    (200000, 128)
l1.embeddings.position_embeddings.weight                  (512, 128)
l1.embeddings.token_type_embeddings.weight                  (2, 128)
l1.embeddings.LayerNorm.weight                                (128,)
l1.embeddings.LayerNorm.bias                                  (128,)

==== First Transformer ====

l1.encoder.embedding_hidden_mapping_in.weight             (768, 128)
l1.encoder.embedding_hidden_mapping_in.bias                   (768,)
l1.encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm.weight       (768,)
l1.encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm.bias       (768,)
l1.encoder.albert_layer_groups.0.albert_layers.0.attention.query.weight   (768, 768)
l1.encoder.albert_layer_groups.0.albert_layers.0.attention.query.bias       (768,)
l1.encoder.albert_layer_groups.0.albert_layers.0.attention.k

In [27]:
def loss_fn(outputs, targets):
    return torch.nn.CrossEntropyLoss()(outputs, targets)

In [28]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [29]:
def train(epoch):
    total_train_loss = 0
    count = 0
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        h_ids = data['h_ids'].to(device, dtype = torch.long)
        h_mask = data['h_mask'].to(device, dtype = torch.long)
        h_token_type_ids = data['h_token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)
        emoji = data['emoji'].to(device, dtype = torch.long)
        outputs = model(ids, mask, token_type_ids, h_ids, h_mask, h_token_type_ids, emoji)
        optimizer.zero_grad()
#         loss = outputs.loss
        loss = loss_fn(outputs, targets)
#         if _%50==0:
#             print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        total_train_loss += loss.item()
        count += 1
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    print(f'Epoch: {epoch}, Loss:  {total_train_loss/count}')
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            h_ids = data['h_ids'].to(device, dtype = torch.long)
            h_mask = data['h_mask'].to(device, dtype = torch.long)
            h_token_type_ids = data['h_token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            emoji = data['emoji'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids, h_ids, h_mask, h_token_type_ids, emoji)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    fin_outputs = list(np.argmax(np.array(fin_outputs), axis=1).flatten())
    print(classification_report(fin_targets, fin_outputs))
    torch.save(model, '../temp/finetuned/'+lab+'_epoch_'+str(epoch))
    return fin_outputs, fin_targets
#     final_outputs = np.array(fin_outputs) >=0.5
#     final = []
#     final_t = []
#     final_fine = [[],[],[],[]]
#     final_fine_t = [[],[],[],[]]
#     for (i,j) in zip(final_outputs, fin_targets):
#         output_sum = sum(i)
#         target_sum = sum(j)
#         if output_sum == 0:
#             final.append(0)
#         else:
#             final.append(1)
#         if target_sum == 0:
#             final_t.append(0)
#         else:
#             final_t.append(1)
#         for p in range(4):
#             final_fine[p].append(int(i[p]))
#             final_fine_t[p].append(int(j[p]))
#     print("Coarse:")
#     print(classification_report(final, final_t))
#     for i in range(4):
#         print("Fine", i)
    
#     return fin_outputs, fin_targets

In [30]:
# for epoch in range(EPOCHS):
#     out, tar = train(epoch)
# #     break

In [31]:
# out[0:10], tar[0:10]hjkhnk

In [32]:
# Creating the dataset and dataloader for the neural network

# train_size = 0.8
# test_data=test.sample(frac=1,random_state=200)
# test_data=train.drop(train_data.index).reset_index(drop=True)
test_data = test.reset_index(drop=True)
testing = MultiLabelDataset(test_data, tokenizer, MAX_LEN, t=True)
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }
testing_loader = DataLoader(testing, **test_params)

In [33]:
!nvidia-smi

Sun Jan  3 18:04:43 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.95.01    Driver Version: 440.95.01    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 208...  Off  | 00000000:02:00.0 Off |                  N/A |
| 31%   24C    P2    45W / 250W |   6555MiB / 11019MiB |      4%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:03:00.0 Off |                  N/A |
| 29%   22C    P8    18W / 250W |     11MiB / 11019MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  GeForce RTX 208...  Off  | 00000000:82:00.0 Off |                  N/A |
| 29%   

In [34]:

fin_targets=[]
fin_outputs=[]
# print(f'Epoch: {epoch}, Loss:  {total_train_loss/count}')
with torch.no_grad():
    model = torch.load(epoch_name, map_location=device)
    model.eval()
    for _, data in tqdm(enumerate(testing_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        h_ids = data['h_ids'].to(device, dtype = torch.long)
        h_mask = data['h_mask'].to(device, dtype = torch.long)
        h_token_type_ids = data['h_token_type_ids'].to(device, dtype = torch.long)
#         targets = data['targets'].to(device, dtype = torch.long)
        emoji = data['emoji'].to(device, dtype = torch.long)
        outputs = model(ids, mask, token_type_ids, h_ids, h_mask, h_token_type_ids, emoji)
#         fin_targets.extend(targets.cpu().detach().numpy().tolist())
        fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
fin_outputs = list(np.argmax(np.array(fin_outputs), axis=1).flatten())
# print(classification_report(fin_outputs, fin_targets))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
51it [00:08,  5.73it/s]


In [35]:
fin_outputs[0:20]

[1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1]

In [36]:
test_data.head(10)

Unnamed: 0,tweet_id,full_tweet,tweet_raw_text,hashtags,smiley,emoji,url,mentions,numerals,reserved_word,emotext,segmented_hash,clean,tweet
0,1,‡§ï‡•Ä‡§∏ ‡§ï‡•Ä ‡§ï‡•ã ‡§∞‡•ã‡§ú‡§ó‡§æ‡§∞ ‡§ö‡§æ‡§π‡§ø‡§è ‡§´‡§ø‡§∞ ‡§®‡§π‡•Ä‡§Ç ‡§ï‡§π‡§®‡§æ ‡§∞‡•ã‡§ú‡§ó‡§æ‡§∞ ‡§®‡§π...,‡§ï‡•Ä‡§∏ ‡§ï‡•Ä ‡§ï‡•ã ‡§∞‡•ã‡§ú‡§ó‡§æ‡§∞ ‡§ö‡§æ‡§π‡§ø‡§è ‡§´‡§ø‡§∞ ‡§®‡§π‡•Ä‡§Ç ‡§ï‡§π‡§®‡§æ ‡§∞‡•ã‡§ú‡§ó‡§æ‡§∞ ‡§®‡§π...,[],[],[],[],[],"[20, 6, 10, 20, 6]",[],[],[],‡§ï‡•Ä‡§∏ ‡§ï‡•Ä ‡§ï‡•ã ‡§∞‡•ã‡§ú‡§ó‡§æ‡§∞ ‡§ö‡§æ‡§π‡§ø‡§è ‡§´‡§ø‡§∞ ‡§®‡§π‡•Ä‡§Ç ‡§ï‡§π‡§®‡§æ ‡§∞‡•ã‡§ú‡§ó‡§æ‡§∞ ‡§®‡§π...,‡§ï‡•Ä‡§∏ ‡§ï‡•Ä ‡§ï‡•ã ‡§∞‡•ã‡§ú‡§ó‡§æ‡§∞ ‡§ö‡§æ‡§π‡§ø‡§è ‡§´‡§ø‡§∞ ‡§®‡§π‡•Ä‡§Ç ‡§ï‡§π‡§®‡§æ ‡§∞‡•ã‡§ú‡§ó‡§æ‡§∞ ‡§®‡§π...
1,2,‡§™‡§ü‡§®‡§æ: BMP ‡§ï‡•à‡§Ç‡§™ ‡§Æ‡•á‡§Ç ‡§™‡•Å‡§∞‡•Å‡§∑ ‡§î‡§∞ ‡§Æ‡§π‡§ø‡§≤‡§æ ‡§ï‡§æ‡§Ç‡§∏‡•ç‡§ü‡•á‡§¨‡§≤ ‡§®‡•á...,‡§™‡§ü‡§®‡§æ : BMP ‡§ï‡•à‡§Ç‡§™ ‡§Æ‡•á‡§Ç ‡§™‡•Å‡§∞‡•Å‡§∑ ‡§î‡§∞ ‡§Æ‡§π‡§ø‡§≤‡§æ ‡§ï‡§æ‡§Ç‡§∏‡•ç‡§ü‡•á‡§¨‡§≤ ...,[],[],[],[https://t.co/Dq05hREifM],[@kumarprakash4u],[],[],[],[],‡§™‡§ü‡§®‡§æ: BMP ‡§ï‡•à‡§Ç‡§™ ‡§Æ‡•á‡§Ç ‡§™‡•Å‡§∞‡•Å‡§∑ ‡§î‡§∞ ‡§Æ‡§π‡§ø‡§≤‡§æ ‡§ï‡§æ‡§Ç‡§∏‡•ç‡§ü‡•á‡§¨‡§≤ ‡§®‡•á...,‡§™‡§ü‡§®‡§æ: BMP ‡§ï‡•à‡§Ç‡§™ ‡§Æ‡•á‡§Ç ‡§™‡•Å‡§∞‡•Å‡§∑ ‡§î‡§∞ ‡§Æ‡§π‡§ø‡§≤‡§æ ‡§ï‡§æ‡§Ç‡§∏‡•ç‡§ü‡•á‡§¨‡§≤ ‡§®‡•á...
2,3,"‡§ï‡•ã‡§à ‡§≠‡•Ä ‡§ï‡§æ‡§Ç‡§ó‡•ç‡§∞‡•á‡§∏‡•Ä, ‡§ä‡§Ç‡§ö‡•Ä ‡§õ‡§§ ‡§™‡§∞, ‡§∞‡•á‡§≤‡§µ‡•á ‡§≤‡§æ‡§á‡§® ‡§™‡§∞, ‡§ä...","‡§ï‡•ã‡§à ‡§≠‡•Ä ‡§ï‡§æ‡§Ç‡§ó‡•ç‡§∞‡•á‡§∏‡•Ä , ‡§ä‡§Ç‡§ö‡•Ä ‡§õ‡§§ ‡§™‡§∞ , ‡§∞‡•á‡§≤‡§µ‡•á ‡§≤‡§æ‡§á‡§® ‡§™...",[],[],"[üôè, üòÇ, üëç]",[],[],[],[],"[folded hands, face with tears of joy, thumbs up]",[],"‡§ï‡•ã‡§à ‡§≠‡•Ä ‡§ï‡§æ‡§Ç‡§ó‡•ç‡§∞‡•á‡§∏‡•Ä, ‡§ä‡§Ç‡§ö‡•Ä ‡§õ‡§§ ‡§™‡§∞, ‡§∞‡•á‡§≤‡§µ‡•á ‡§≤‡§æ‡§á‡§® ‡§™‡§∞, ‡§ä...","‡§ï‡•ã‡§à ‡§≠‡•Ä ‡§ï‡§æ‡§Ç‡§ó‡•ç‡§∞‡•á‡§∏‡•Ä, ‡§ä‡§Ç‡§ö‡•Ä ‡§õ‡§§ ‡§™‡§∞, ‡§∞‡•á‡§≤‡§µ‡•á ‡§≤‡§æ‡§á‡§® ‡§™‡§∞, ‡§ä..."
3,4,‡§Ö‡§Ç‡§°‡§∞‡§µ‡§∞‡•ç‡§≤‡•ç‡§° ‡§°‡•â‡§® ‡§õ‡•ã‡§ü‡§æ ‡§∞‡§æ‡§ú‡§® ‡§ï‡•á ‡§≠‡§æ‡§à ‡§ï‡•ã ‡§¨‡•Ä‡§ú‡•á‡§™‡•Ä ‡§¶‡•ç‡§µ‡§æ...,‡§Ö‡§Ç‡§°‡§∞‡§µ‡§∞‡•ç‡§≤‡•ç‡§° ‡§°‡•â‡§® ‡§õ‡•ã‡§ü‡§æ ‡§∞‡§æ‡§ú‡§® ‡§ï‡•á ‡§≠‡§æ‡§à ‡§ï‡•ã ‡§¨‡•Ä‡§ú‡•á‡§™‡•Ä ‡§¶‡•ç‡§µ‡§æ...,[],[],[],[],[],[],[],[],[],‡§Ö‡§Ç‡§°‡§∞‡§µ‡§∞‡•ç‡§≤‡•ç‡§° ‡§°‡•â‡§® ‡§õ‡•ã‡§ü‡§æ ‡§∞‡§æ‡§ú‡§® ‡§ï‡•á ‡§≠‡§æ‡§à ‡§ï‡•ã ‡§¨‡•Ä‡§ú‡•á‡§™‡•Ä ‡§¶‡•ç‡§µ‡§æ...,‡§Ö‡§Ç‡§°‡§∞‡§µ‡§∞‡•ç‡§≤‡•ç‡§° ‡§°‡•â‡§® ‡§õ‡•ã‡§ü‡§æ ‡§∞‡§æ‡§ú‡§® ‡§ï‡•á ‡§≠‡§æ‡§à ‡§ï‡•ã ‡§¨‡•Ä‡§ú‡•á‡§™‡•Ä ‡§¶‡•ç‡§µ‡§æ...
4,5,RT @_Pb_swain_: ‡§á‡§® ‡§™‡§Ç‡§ö‡§∞ ‡§õ‡§æ‡§™‡•ã‡§Ç ‡§ï‡•ã ‡§ï‡•ã‡§® ‡§∏‡§Æ‡§ù‡§æ‡§è ‡§ï‡§ø ...,: ‡§á‡§® ‡§™‡§Ç‡§ö‡§∞ ‡§õ‡§æ‡§™‡•ã‡§Ç ‡§ï‡•ã ‡§ï‡•ã‡§® ‡§∏‡§Æ‡§ù‡§æ‡§è ‡§ï‡§ø ‡§â‡§®‡§ï‡•á ‡§∞‡•ã‡§ú‡§ó‡§æ‡§∞ ‡§Æ...,[],[],"[üëá, üòÇ, üòÇ, üòÇ, üòÇ]",[],[@_Pb_swain_],[],[RT],"[backhand index pointing down, face with tears...",[],RT : ‡§á‡§® ‡§™‡§Ç‡§ö‡§∞ ‡§õ‡§æ‡§™‡•ã‡§Ç ‡§ï‡•ã ‡§ï‡•ã‡§® ‡§∏‡§Æ‡§ù‡§æ‡§è ‡§ï‡§ø ‡§â‡§®‡§ï‡•á ‡§∞‡•ã‡§ú‡§ó‡§æ...,RT @_Pb_swain_: ‡§á‡§® ‡§™‡§Ç‡§ö‡§∞ ‡§õ‡§æ‡§™‡•ã‡§Ç ‡§ï‡•ã ‡§ï‡•ã‡§® ‡§∏‡§Æ‡§ù‡§æ‡§è ‡§ï‡§ø ...
5,6,"‡§™‡§∂‡•ç‡§ö‡§ø‡§Æ ‡§¨‡§Ç‡§ó‡§æ‡§≤ ‡§Æ‡•á‡§Ç ‡§´‡§ø‡§∞ ‡§π‡•Å‡§à ‡§Æ‡§æ‡§ì‡§µ‡§æ‡§¶‡§ø‡§Ø‡•ã‡§Ç ‡§ï‡•Ä ‡§µ‡§æ‡§™‡§∏‡•Ä, ...","‡§™‡§∂‡•ç‡§ö‡§ø‡§Æ ‡§¨‡§Ç‡§ó‡§æ‡§≤ ‡§Æ‡•á‡§Ç ‡§´‡§ø‡§∞ ‡§π‡•Å‡§à ‡§Æ‡§æ‡§ì‡§µ‡§æ‡§¶‡§ø‡§Ø‡•ã‡§Ç ‡§ï‡•Ä ‡§µ‡§æ‡§™‡§∏‡•Ä ,...","[#Maoist, #WestBengal]",[],[],[https://t.co/pP1AOvOv0b],[],[],[],[],"[maoist, west bengal]","‡§™‡§∂‡•ç‡§ö‡§ø‡§Æ ‡§¨‡§Ç‡§ó‡§æ‡§≤ ‡§Æ‡•á‡§Ç ‡§´‡§ø‡§∞ ‡§π‡•Å‡§à ‡§Æ‡§æ‡§ì‡§µ‡§æ‡§¶‡§ø‡§Ø‡•ã‡§Ç ‡§ï‡•Ä ‡§µ‡§æ‡§™‡§∏‡•Ä, ...","‡§™‡§∂‡•ç‡§ö‡§ø‡§Æ ‡§¨‡§Ç‡§ó‡§æ‡§≤ ‡§Æ‡•á‡§Ç ‡§´‡§ø‡§∞ ‡§π‡•Å‡§à ‡§Æ‡§æ‡§ì‡§µ‡§æ‡§¶‡§ø‡§Ø‡•ã‡§Ç ‡§ï‡•Ä ‡§µ‡§æ‡§™‡§∏‡•Ä, ..."
6,7,#Breaking-‡§ï‡§Ç‡§ó‡§®‡§æ ‡§Æ‡§æ‡§Æ‡§≤‡•á ‡§™‡§∞ ‡§¨‡•ã‡§≤‡•á ‡§Æ‡§®‡•ã‡§ú ‡§§‡§ø‡§µ‡§æ‡§∞‡•Ä-‡§ï‡§π‡§æ ...,#Breaking-‡§ï‡§Ç‡§ó‡§®‡§æ ‡§Æ‡§æ‡§Æ‡§≤‡•á ‡§™‡§∞ ‡§¨‡•ã‡§≤‡•á ‡§Æ‡§®‡•ã‡§ú ‡§§‡§ø‡§µ‡§æ‡§∞‡•Ä-‡§ï‡§π‡§æ ...,"[#Breaking, #Sushantsinghcase, #Kangana]",[],[],[https://t.co/szOTZWq1hI],[],[],[],[],"[breaking, sushantsinghcase, kangana]",-‡§ï‡§Ç‡§ó‡§®‡§æ ‡§Æ‡§æ‡§Æ‡§≤‡•á ‡§™‡§∞ ‡§¨‡•ã‡§≤‡•á ‡§Æ‡§®‡•ã‡§ú ‡§§‡§ø‡§µ‡§æ‡§∞‡•Ä-‡§ï‡§π‡§æ ‡§ß‡§Æ‡§ï‡•Ä ‡§Æ‡§ø‡§≤...,#Breaking-‡§ï‡§Ç‡§ó‡§®‡§æ ‡§Æ‡§æ‡§Æ‡§≤‡•á ‡§™‡§∞ ‡§¨‡•ã‡§≤‡•á ‡§Æ‡§®‡•ã‡§ú ‡§§‡§ø‡§µ‡§æ‡§∞‡•Ä-‡§ï‡§π‡§æ ...
7,8,@BasudebaTripat4: @Rajanspsingh1 ‡§Ö‡§ö‡•ç‡§õ‡§æ ‡§ï‡§ø‡§Ø‡§æ ‡§∏‡§æ...,": ‡§Ö‡§ö‡•ç‡§õ‡§æ ‡§ï‡§ø‡§Ø‡§æ ‡§∏‡§æ‡§≤‡•á ‡§ï‡§æ ‡§∏‡§∞ ‡§´‡•ã‡§°‡§º ‡§¶‡§ø‡§Ø‡§æ , , ‡§ó‡§∞‡•ç‡§¶‡§®...",[],[],[],[],"[@BasudebaTripat4, @Rajanspsingh1]",[],[],[],[],": ‡§Ö‡§ö‡•ç‡§õ‡§æ ‡§ï‡§ø‡§Ø‡§æ ‡§∏‡§æ‡§≤‡•á ‡§ï‡§æ ‡§∏‡§∞ ‡§´‡•ã‡§°‡§º ‡§¶‡§ø‡§Ø‡§æ,, ‡§ó‡§∞‡•ç‡§¶‡§® ‡§§...",@BasudebaTripat4: @Rajanspsingh1 ‡§Ö‡§ö‡•ç‡§õ‡§æ ‡§ï‡§ø‡§Ø‡§æ ‡§∏‡§æ...
8,9,‡§π‡•à‡§¶‡§∞‡§æ‡§¨‡§æ‡§¶ ‡§¨‡•Ä‡§ú‡•á‡§™‡•Ä ‡§µ‡§ø‡§ß‡§æ‡§Ø‡§ï ‡§∞‡§æ‡§ú‡§æ ‡§∏‡§ø‡§Ç‡§π ‡§ï‡•Ä ‡§¨‡§π‡§® ‡§Æ‡§æ‡§Ø‡§æ ‡§¶...,‡§π‡•à‡§¶‡§∞‡§æ‡§¨‡§æ‡§¶ ‡§¨‡•Ä‡§ú‡•á‡§™‡•Ä ‡§µ‡§ø‡§ß‡§æ‡§Ø‡§ï ‡§∞‡§æ‡§ú‡§æ ‡§∏‡§ø‡§Ç‡§π ‡§ï‡•Ä ‡§¨‡§π‡§® ‡§Æ‡§æ‡§Ø‡§æ ‡§¶...,[],[],[],[],[],[],[],[],[],‡§π‡•à‡§¶‡§∞‡§æ‡§¨‡§æ‡§¶ ‡§¨‡•Ä‡§ú‡•á‡§™‡•Ä ‡§µ‡§ø‡§ß‡§æ‡§Ø‡§ï ‡§∞‡§æ‡§ú‡§æ ‡§∏‡§ø‡§Ç‡§π ‡§ï‡•Ä ‡§¨‡§π‡§® ‡§Æ‡§æ‡§Ø‡§æ ‡§¶...,‡§π‡•à‡§¶‡§∞‡§æ‡§¨‡§æ‡§¶ ‡§¨‡•Ä‡§ú‡•á‡§™‡•Ä ‡§µ‡§ø‡§ß‡§æ‡§Ø‡§ï ‡§∞‡§æ‡§ú‡§æ ‡§∏‡§ø‡§Ç‡§π ‡§ï‡•Ä ‡§¨‡§π‡§® ‡§Æ‡§æ‡§Ø‡§æ ‡§¶...
9,10,"‡§ï‡§Æ‡§≤‡§®‡§æ‡§• ‡§ï‡•á ‡§∞‡§æ‡§ú ‡§Æ‡•á‡§Ç 100,‚Çπ ‡§Æ‡•á‡§Ç 100‡§Ø‡•Ç‡§®‡§ø‡§ü ‡§¨‡§ø‡§ú‡§≤‡•Ä ‡§Æ‡§ø‡§≤...","‡§ï‡§Æ‡§≤‡§®‡§æ‡§• ‡§ï‡•á ‡§∞‡§æ‡§ú ‡§Æ‡•á‡§Ç , ‚Çπ ‡§Æ‡•á‡§Ç 100‡§Ø‡•Ç‡§®‡§ø‡§ü ‡§¨‡§ø‡§ú‡§≤‡•Ä ‡§Æ‡§ø‡§≤ ‡§∞...",[],[],[],[],[],"[100, 100, 500, 1000]",[],[],[],"‡§ï‡§Æ‡§≤‡§®‡§æ‡§• ‡§ï‡•á ‡§∞‡§æ‡§ú ‡§Æ‡•á‡§Ç 100,‚Çπ ‡§Æ‡•á‡§Ç 100‡§Ø‡•Ç‡§®‡§ø‡§ü ‡§¨‡§ø‡§ú‡§≤‡•Ä ‡§Æ‡§ø‡§≤...","‡§ï‡§Æ‡§≤‡§®‡§æ‡§• ‡§ï‡•á ‡§∞‡§æ‡§ú ‡§Æ‡•á‡§Ç 100,‚Çπ ‡§Æ‡•á‡§Ç 100‡§Ø‡•Ç‡§®‡§ø‡§ü ‡§¨‡§ø‡§ú‡§≤‡•Ä ‡§Æ‡§ø‡§≤..."


In [37]:
test['label'] = np.array(fin_outputs)

In [38]:
len(fin_outputs)

1630

In [39]:
len(test.full_tweet.values)

1630

In [40]:
test.sample(10)

Unnamed: 0,tweet_id,full_tweet,tweet_raw_text,hashtags,smiley,emoji,url,mentions,numerals,reserved_word,emotext,segmented_hash,clean,tweet,label
545,569,‡§∏‡•ç‡§µ‡§ö‡•ç‡§õ ‡§≠‡§æ‡§∞‡§§-‡§∏‡•ç‡§µ‡§∏‡•ç‡§• ‡§≠‡§æ‡§∞‡§§' ‡§ï‡•á ‡§∏‡§Ç‡§ï‡§≤‡•ç‡§™ ‡§ï‡•á ‡§∏‡§æ‡§• #Swa...,‡§∏‡•ç‡§µ‡§ö‡•ç‡§õ ‡§≠‡§æ‡§∞‡§§-‡§∏‡•ç‡§µ‡§∏‡•ç‡§• ‡§≠‡§æ‡§∞‡§§' ‡§ï‡•á ‡§∏‡§Ç‡§ï‡§≤‡•ç‡§™ ‡§ï‡•á ‡§∏‡§æ‡§• ‡§Æ‡•á‡§Ç ...,[#SwachhSurvekshan2020],[],[],[],[@narendramodi],[19],[],[],[swachh survekshan 2020],‡§∏‡•ç‡§µ‡§ö‡•ç‡§õ ‡§≠‡§æ‡§∞‡§§-‡§∏‡•ç‡§µ‡§∏‡•ç‡§• ‡§≠‡§æ‡§∞‡§§' ‡§ï‡•á ‡§∏‡§Ç‡§ï‡§≤‡•ç‡§™ ‡§ï‡•á ‡§∏‡§æ‡§• ‡§Æ‡•á...,‡§∏‡•ç‡§µ‡§ö‡•ç‡§õ ‡§≠‡§æ‡§∞‡§§-‡§∏‡•ç‡§µ‡§∏‡•ç‡§• ‡§≠‡§æ‡§∞‡§§' ‡§ï‡•á ‡§∏‡§Ç‡§ï‡§≤‡•ç‡§™ ‡§ï‡•á ‡§∏‡§æ‡§• #Swa...,0
1191,1215,‡§Æ‡•à‡§Ç ‡§Ö‡§™‡§®‡•Ä Jio ‡§∏‡§ø‡§Æ ‡§ï‡•á ‡§è‡§° ‡§∏‡•á ‡§∂‡§æ‡§π‡§∞‡•Å‡§ñ ‡§ñ‡§æ‡§® ‡§ï‡•ã ‡§®‡§ø‡§ï‡§æ‡§≤ ...,‡§Æ‡•à‡§Ç ‡§Ö‡§™‡§®‡•Ä Jio ‡§∏‡§ø‡§Æ ‡§ï‡•á ‡§è‡§° ‡§∏‡•á ‡§∂‡§æ‡§π‡§∞‡•Å‡§ñ ‡§ñ‡§æ‡§® ‡§ï‡•ã ‡§®‡§ø‡§ï‡§æ‡§≤ ...,[],[],[],[],[],[],[],[],[],‡§Æ‡•à‡§Ç ‡§Ö‡§™‡§®‡•Ä Jio ‡§∏‡§ø‡§Æ ‡§ï‡•á ‡§è‡§° ‡§∏‡•á ‡§∂‡§æ‡§π‡§∞‡•Å‡§ñ ‡§ñ‡§æ‡§® ‡§ï‡•ã ‡§®‡§ø‡§ï‡§æ‡§≤ ...,‡§Æ‡•à‡§Ç ‡§Ö‡§™‡§®‡•Ä Jio ‡§∏‡§ø‡§Æ ‡§ï‡•á ‡§è‡§° ‡§∏‡•á ‡§∂‡§æ‡§π‡§∞‡•Å‡§ñ ‡§ñ‡§æ‡§® ‡§ï‡•ã ‡§®‡§ø‡§ï‡§æ‡§≤ ...,1
245,269,"@LakheraSatish: ‡§¶‡•á‡§ñ ‡§ö‡•Ä‡§®‡•Ä ‡§∏‡•à‡§®‡§ø‡§ï‡•ã‡§Ç ‡§ï‡•Ä ‡§ï‡§¨‡•ç‡§∞, ‡§ï‡§æ‡§Ç‡§ó...",": ‡§¶‡•á‡§ñ ‡§ö‡•Ä‡§®‡•Ä ‡§∏‡•à‡§®‡§ø‡§ï‡•ã‡§Ç ‡§ï‡•Ä ‡§ï‡§¨‡•ç‡§∞ , ‡§ï‡§æ‡§Ç‡§ó‡•ç‡§∞‡•á‡§∏ ‡§ï‡§æ ‡§ü‡•Ç‡§ü...",[],[],[],[],[@LakheraSatish],[],[],[],[],": ‡§¶‡•á‡§ñ ‡§ö‡•Ä‡§®‡•Ä ‡§∏‡•à‡§®‡§ø‡§ï‡•ã‡§Ç ‡§ï‡•Ä ‡§ï‡§¨‡•ç‡§∞, ‡§ï‡§æ‡§Ç‡§ó‡•ç‡§∞‡•á‡§∏ ‡§ï‡§æ ‡§ü‡•Ç‡§ü ‡§∞...","@LakheraSatish: ‡§¶‡•á‡§ñ ‡§ö‡•Ä‡§®‡•Ä ‡§∏‡•à‡§®‡§ø‡§ï‡•ã‡§Ç ‡§ï‡•Ä ‡§ï‡§¨‡•ç‡§∞, ‡§ï‡§æ‡§Ç‡§ó...",1
173,197,‡§™‡§§‡•ç‡§∞ ‡§ï‡•á ‡§∂‡•Å‡§∞‡•Å‡§Ü‡§§ ‡§∏‡•á‡§®‡§æ ‡§ï‡•á ‡§≤‡•ã‡§ó‡•ã‡§Ç ‡§ï‡•ã ‡§®‡§µ ‡§µ‡§∞‡•ç‡§∑ ‡§ï‡•Ä ‡§∂‡•Å‡§≠...,‡§™‡§§‡•ç‡§∞ ‡§ï‡•á ‡§∂‡•Å‡§∞‡•Å‡§Ü‡§§ ‡§∏‡•á‡§®‡§æ ‡§ï‡•á ‡§≤‡•ã‡§ó‡•ã‡§Ç ‡§ï‡•ã ‡§®‡§µ ‡§µ‡§∞‡•ç‡§∑ ‡§ï‡•Ä ‡§∂‡•Å‡§≠...,[],[],[],[],[],[],[],[],[],‡§™‡§§‡•ç‡§∞ ‡§ï‡•á ‡§∂‡•Å‡§∞‡•Å‡§Ü‡§§ ‡§∏‡•á‡§®‡§æ ‡§ï‡•á ‡§≤‡•ã‡§ó‡•ã‡§Ç ‡§ï‡•ã ‡§®‡§µ ‡§µ‡§∞‡•ç‡§∑ ‡§ï‡•Ä ‡§∂‡•Å‡§≠...,‡§™‡§§‡•ç‡§∞ ‡§ï‡•á ‡§∂‡•Å‡§∞‡•Å‡§Ü‡§§ ‡§∏‡•á‡§®‡§æ ‡§ï‡•á ‡§≤‡•ã‡§ó‡•ã‡§Ç ‡§ï‡•ã ‡§®‡§µ ‡§µ‡§∞‡•ç‡§∑ ‡§ï‡•Ä ‡§∂‡•Å‡§≠...,1
1116,1140,Redmi 9 Prime ‡§î‡§∞ Redmi 9 ‡§ï‡•Ä ‡§∏‡•á‡§≤ ‡§Ü‡§ú ‡§¶‡•ã‡§™‡§π‡§∞ 12 ‡§¨‡§ú...,"Redmi Prime ‡§î‡§∞ Redmi ‡§ï‡•Ä ‡§∏‡•á‡§≤ ‡§Ü‡§ú ‡§¶‡•ã‡§™‡§π‡§∞ ‡§¨‡§ú‡•á , ‡§ú‡§æ...",[],[],[],[https://t.co/KzbthV3juh],[],"[9, 9, 12]",[],[],[],Redmi 9 Prime ‡§î‡§∞ Redmi 9 ‡§ï‡•Ä ‡§∏‡•á‡§≤ ‡§Ü‡§ú ‡§¶‡•ã‡§™‡§π‡§∞ 12 ‡§¨‡§ú...,Redmi 9 Prime ‡§î‡§∞ Redmi 9 ‡§ï‡•Ä ‡§∏‡•á‡§≤ ‡§Ü‡§ú ‡§¶‡•ã‡§™‡§π‡§∞ 12 ‡§¨‡§ú...,0
78,79,‡§≠‡§æ‡§∞‡§§ ‡§®‡•á ‡§ö‡•Ä‡§® ‡§∏‡•á ‡§∏‡•Ä‡§Æ‡§æ ‡§™‡§∞ ‡§§‡•à‡§®‡§æ‡§§ ‡§â‡§∏‡§ï‡•á ‡§∏‡•à‡§®‡§ø‡§ï‡•ã‡§Ç ‡§ï‡•ã ‡§Ö...,‡§≠‡§æ‡§∞‡§§ ‡§®‡•á ‡§ö‡•Ä‡§® ‡§∏‡•á ‡§∏‡•Ä‡§Æ‡§æ ‡§™‡§∞ ‡§§‡•à‡§®‡§æ‡§§ ‡§â‡§∏‡§ï‡•á ‡§∏‡•à‡§®‡§ø‡§ï‡•ã‡§Ç ‡§ï‡•ã ‡§Ö...,[],[],[],[],[],[],[],[],[],‡§≠‡§æ‡§∞‡§§ ‡§®‡•á ‡§ö‡•Ä‡§® ‡§∏‡•á ‡§∏‡•Ä‡§Æ‡§æ ‡§™‡§∞ ‡§§‡•à‡§®‡§æ‡§§ ‡§â‡§∏‡§ï‡•á ‡§∏‡•à‡§®‡§ø‡§ï‡•ã‡§Ç ‡§ï‡•ã ‡§Ö...,‡§≠‡§æ‡§∞‡§§ ‡§®‡•á ‡§ö‡•Ä‡§® ‡§∏‡•á ‡§∏‡•Ä‡§Æ‡§æ ‡§™‡§∞ ‡§§‡•à‡§®‡§æ‡§§ ‡§â‡§∏‡§ï‡•á ‡§∏‡•à‡§®‡§ø‡§ï‡•ã‡§Ç ‡§ï‡•ã ‡§Ö...,1
104,105,‡§¶‡§ø‡§≤ ‡§¶‡§π‡§≤‡§æ ‡§¶‡•á‡§®‡•á ‡§µ‡§æ‡§≤‡•Ä ‡§ñ‡§¨‡§∞‡•§ 6 ‡§∏‡§æ‡§≤ ‡§ï‡•Ä ‡§¨‡§ö‡•ç‡§ö‡•Ä ‡§ï‡•á ‡§∏‡§æ‡§• ...,‡§¶‡§ø‡§≤ ‡§¶‡§π‡§≤‡§æ ‡§¶‡•á‡§®‡•á ‡§µ‡§æ‡§≤‡•Ä ‡§ñ‡§¨‡§∞‡•§ ‡§∏‡§æ‡§≤ ‡§ï‡•Ä ‡§¨‡§ö‡•ç‡§ö‡•Ä ‡§ï‡•á ‡§∏‡§æ‡§• ‡§∞‡•á...,[#‡§ú],[],[],[],[],[6],[],[],[‡§ú],‡§¶‡§ø‡§≤ ‡§¶‡§π‡§≤‡§æ ‡§¶‡•á‡§®‡•á ‡§µ‡§æ‡§≤‡•Ä ‡§ñ‡§¨‡§∞‡•§ 6 ‡§∏‡§æ‡§≤ ‡§ï‡•Ä ‡§¨‡§ö‡•ç‡§ö‡•Ä ‡§ï‡•á ‡§∏‡§æ‡§• ...,‡§¶‡§ø‡§≤ ‡§¶‡§π‡§≤‡§æ ‡§¶‡•á‡§®‡•á ‡§µ‡§æ‡§≤‡•Ä ‡§ñ‡§¨‡§∞‡•§ 6 ‡§∏‡§æ‡§≤ ‡§ï‡•Ä ‡§¨‡§ö‡•ç‡§ö‡•Ä ‡§ï‡•á ‡§∏‡§æ‡§• ...,1
1055,1079,"‡§Æ‡•â‡§Æ-‡§°‡•à‡§° ‡§¨‡§®‡§®‡•á ‡§µ‡§æ‡§≤‡•á ‡§π‡•à‡§Ç ‡§µ‡§ø‡§∞‡•Å‡§∑‡•ç‡§ï‡§æ, ‡§µ‡§ø‡§∞‡§æ‡§ü ‡§¨‡•ã‡§≤‡•á- ‡§Ö‡§µ...","‡§Æ‡•â‡§Æ-‡§°‡•à‡§° ‡§¨‡§®‡§®‡•á ‡§µ‡§æ‡§≤‡•á ‡§π‡•à‡§Ç ‡§µ‡§ø‡§∞‡•Å‡§∑‡•ç‡§ï‡§æ , ‡§µ‡§ø‡§∞‡§æ‡§ü ‡§¨‡•ã‡§≤‡•á- ...","[#virushka, #RCB, #TeamIndia, #ViratKohli]",[],[],"[https://t.co/8x4k3Mkk1g, https://t.co/YJBZvv4...",[],[],[],[],"[virus hk a, rcb, team india, virat kohli]","‡§Æ‡•â‡§Æ-‡§°‡•à‡§° ‡§¨‡§®‡§®‡•á ‡§µ‡§æ‡§≤‡•á ‡§π‡•à‡§Ç ‡§µ‡§ø‡§∞‡•Å‡§∑‡•ç‡§ï‡§æ, ‡§µ‡§ø‡§∞‡§æ‡§ü ‡§¨‡•ã‡§≤‡•á- ‡§Ö‡§µ...","‡§Æ‡•â‡§Æ-‡§°‡•à‡§° ‡§¨‡§®‡§®‡•á ‡§µ‡§æ‡§≤‡•á ‡§π‡•à‡§Ç ‡§µ‡§ø‡§∞‡•Å‡§∑‡•ç‡§ï‡§æ, ‡§µ‡§ø‡§∞‡§æ‡§ü ‡§¨‡•ã‡§≤‡•á- ‡§Ö‡§µ...",0
916,940,#CoronaUpdates : ‡§∏‡§¨‡§∏‡•á ‡§Ö‡§ß‡§ø‡§ï ‡§∏‡§Ç‡§ï‡•ç‡§∞‡§Æ‡§ø‡§§‡•ã‡§Ç ‡§µ‡§æ‡§≤‡§æ ‡§¶‡•Ç‡§∏...,: ‡§∏‡§¨‡§∏‡•á ‡§Ö‡§ß‡§ø‡§ï ‡§∏‡§Ç‡§ï‡•ç‡§∞‡§Æ‡§ø‡§§‡•ã‡§Ç ‡§µ‡§æ‡§≤‡§æ ‡§¶‡•Ç‡§∏‡§∞‡§æ ‡§¶‡•á‡§∂ ‡§¨‡§®‡§æ ‡§≠‡§æ‡§∞...,"[#CoronaUpdates, #CoronavirusIndia, #CoronaUpd...",[],[],[https://t.co/BRGQsNNtEe],[],[],[],[],"[corona updates, coronavirus india, corona upd...",: ‡§∏‡§¨‡§∏‡•á ‡§Ö‡§ß‡§ø‡§ï ‡§∏‡§Ç‡§ï‡•ç‡§∞‡§Æ‡§ø‡§§‡•ã‡§Ç ‡§µ‡§æ‡§≤‡§æ ‡§¶‡•Ç‡§∏‡§∞‡§æ ‡§¶‡•á‡§∂ ‡§¨‡§®‡§æ ‡§≠‡§æ...,#CoronaUpdates : ‡§∏‡§¨‡§∏‡•á ‡§Ö‡§ß‡§ø‡§ï ‡§∏‡§Ç‡§ï‡•ç‡§∞‡§Æ‡§ø‡§§‡•ã‡§Ç ‡§µ‡§æ‡§≤‡§æ ‡§¶‡•Ç‡§∏...,0
470,494,@rashtrapatibhvn @narendramodi ‡§™‡§æ‡§™‡•Ä ‡§Ö‡§ß‡§∞‡•ç‡§Æ‡•Ä ‡§Æ‡§®...,‡§™‡§æ‡§™‡•Ä ‡§Ö‡§ß‡§∞‡•ç‡§Æ‡•Ä ‡§Æ‡§®‡§π‡•Ç‡§∏ ‡§ï‡§≤‡§Ç‡§ï ‡§ï‡•ã ‡§ú‡§®‡•ç‡§Æ‡§¶‡§ø‡§® ‡§ï‡•Ä ‡§¨‡§ß‡§æ‡§à ‡§¶‡•á‡§®...,[],[],[],[],"[@rashtrapatibhvn, @narendramodi]",[100],[],[],[],‡§™‡§æ‡§™‡•Ä ‡§Ö‡§ß‡§∞‡•ç‡§Æ‡•Ä ‡§Æ‡§®‡§π‡•Ç‡§∏ ‡§ï‡§≤‡§Ç‡§ï ‡§ï‡•ã ‡§ú‡§®‡•ç‡§Æ‡§¶‡§ø‡§® ‡§ï‡•Ä ‡§¨‡§ß‡§æ‡§à...,@rashtrapatibhvn @narendramodi ‡§™‡§æ‡§™‡•Ä ‡§Ö‡§ß‡§∞‡•ç‡§Æ‡•Ä ‡§Æ‡§®...,1


In [41]:
# def label_decode(val):
#     return labels[val]
# test.label = test.label.apply(label_decode)

In [42]:

test.to_csv(path_or_buf='../temp/labels/'+lab+'.txt', index=False, columns = ['tweet_id', 'label'] )