<a href="https://colab.research.google.com/github/unknown-spec10/Data-Science/blob/main/Tweeter_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer,BertForSequenceClassification,AdamW,get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset,DataLoader,random_split


In [None]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk
nltk.download('stopwords')

In [None]:
df=pd.read_csv("/content/training.1600000.processed.noemoticon.csv",encoding="ISO-8859-1",header=None)

In [None]:
short_df=df.sample(500)


In [None]:
short_df.shape

In [None]:
short_df.to_csv('short_twitter.csv')

#starting with the shortened data

In [None]:
short_df=pd.read_csv('/content/short_twitter.csv',header=None)

In [None]:
df=short_df

In [None]:
# Column 0 holds the old row index that to_csv wrote out with its default
# index=True; it is not part of the tweet data, so drop it.
df.drop(columns=[0],inplace=True)

In [None]:
df.shape

In [None]:
# The CSV was written with a header line but read back with header=None,
# so the first row contains the old column names rather than a tweet; skip it.
df=df.iloc[1:,:]

In [None]:
df.head()

In [None]:
# Keep only the two columns that are renamed to "sentiment" and "text" below.
# .copy() makes new_df an independent frame, so the later column assignments
# do not operate on a slice of df (avoids pandas' SettingWithCopyWarning).
new_df=df[[1,6]].copy()

In [None]:
new_df.head()

In [None]:
new_df.columns=["sentiment","text"]

In [None]:
new_df

#EDA steps

In [None]:
new_df['sentiment'].value_counts().plot(kind="bar")

In [None]:
new_df['sentiment']=new_df['sentiment'].apply(lambda x : 1 if x==4 else x)

In [None]:
# Distinct class labels in sorted order. unique() alone returns them in order
# of first appearance, which varies with the sampled rows; sorting makes the
# list deterministic. Assuming the labels are 0 and 1 after the 4 -> 1 mapping,
# index i then holds label i, which the later all_labels[pred] lookup needs.
all_labels=sorted(new_df["sentiment"].unique().tolist())

In [None]:
all_labels

#preprocessing on data

In [None]:
stemmer=PorterStemmer()

In [None]:
#stopwords.words('english')

In [None]:
def stemming(text):
  """Clean a piece of text and return its stemmed, stopword-free form.

  Non-letter characters are replaced by spaces, the text is lowercased,
  English stopwords are removed and every remaining word is reduced to its
  Porter stem.

  Args:
    text: Raw input string.

  Returns:
    The stemmed words joined by single spaces.
  """
  # Imported locally so the function does not depend on a separate import cell.
  import re
  text=re.sub("[^a-zA-Z]",' ',text).lower()
  # Load the stopword list once per call and use a set for O(1) membership
  # tests, instead of re-reading the list for every single token.
  stop_words=set(stopwords.words('english'))
  # split() with no argument collapses runs of whitespace, so the spaces
  # inserted by the substitution above do not produce empty tokens.
  modified_text=[stemmer.stem(word) for word in text.split() if word not in stop_words]
  return " ".join(modified_text)


In [None]:
stemming("I am a good boy Do you know this")

In [None]:
new_df.head()

In [None]:
new_df['trans_text']=new_df['text'].apply(stemming)

In [None]:
new_df.head()

In [None]:
new_df.iloc[2]['trans_text']

In [None]:
texts=new_df['trans_text'].values
labels=new_df['sentiment'].values

In [None]:
labels

#Model assignment

In [None]:
tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Preview the encoding of the first preprocessed tweet.
tokenizer.encode_plus(new_df['trans_text'].iloc[0],add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 100,           # Pad & truncate all sentences.
                        padding = 'max_length',     # replaces the deprecated pad_to_max_length=True
                        truncation = True,          # actually cut inputs longer than max_length
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt')

In [None]:
def tokenization_map(sentences,lab=None):
  """Encode sentences with the module-level tokenizer.

  Every sentence is encoded with special tokens and padded/truncated to 100
  tokens; the per-sentence results are concatenated along dimension 0.

  Side effect: the module-level ``labels`` is converted to a torch tensor
  (once). Later cells read ``labels`` as a tensor, so this is kept.

  Args:
    sentences: Iterable of strings to encode.
    lab: Optional labels belonging to ``sentences``.

  Returns:
    ``(ids, att_mask)`` when ``lab`` is None, otherwise
    ``(ids, att_mask, lab)`` with ``lab`` converted to a tensor.
  """
  global labels
  input_ids=[]
  attention_mask=[]
  for sentence in sentences:
    ids_dict=tokenizer.encode_plus(sentence,add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 100,           # Pad & truncate all sentences.
                        padding = 'max_length',     # replaces the deprecated pad_to_max_length=True
                        truncation = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt')
    input_ids.append(ids_dict['input_ids'])
    attention_mask.append(ids_dict['attention_mask'])
  ids=torch.cat(input_ids,dim=0)
  att_mask=torch.cat(attention_mask,dim=0)
  # Convert only once: calling torch.tensor() on something that is already a
  # tensor (second and later calls) emits a copy-construct warning.
  if not torch.is_tensor(labels):
    labels=torch.tensor(labels)
  # "is None" rather than "!= None": comparing a NumPy array to None is
  # elementwise, and using that array in an if-statement raises ValueError.
  if lab is None:
    return ids,att_mask
  return ids,att_mask,torch.as_tensor(lab)



In [None]:
input_ids,attention_masks,lebels=tokenization_map(texts,labels)

In [None]:
input_ids.shape

In [None]:
attention_masks.shape

In [None]:
labels.shape

#converting to tensor DataSet

In [None]:
dataset=TensorDataset(input_ids,attention_masks,labels)

In [None]:
len(dataset)

In [None]:
## splitting the dataset: 80% of the samples for training, the rest for validation
train_size=int(0.8*len(dataset))
val_size=len(dataset)-train_size

In [None]:
train_dataset, val_dataset= random_split(dataset,[train_size,val_size])

In [None]:
len(train_dataset)

#converting to dataLoaders

In [None]:
batch_size=32
num_workers=4

In [None]:
train_dataloader=DataLoader(train_dataset,batch_size=batch_size,shuffle=True,num_workers=num_workers)

val_dataloader=DataLoader(val_dataset,batch_size=1,shuffle=True)

#enabling GPU

In [None]:
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#model assignment for training

In [None]:
#help(BertForSequenceClassification.from_pretrained)

In [None]:
# num_labels is the number of distinct classes. len(labels) would be the
# number of samples and would create one output unit per tweet.
model=BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=len(all_labels),
                                                    )

In [None]:
model.to(device)

In [None]:
optimizer=torch.optim.AdamW(model.parameters(),lr=6e-6,eps=1e-8)

#Start Training

In [None]:
num_epoch=10
# The scheduler is stepped once per batch in the training loop, so the total
# number of steps is epochs * batches-per-epoch (not epochs * batch size).
total_steps=num_epoch*len(train_dataloader)
scheduler=get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0,num_training_steps=total_steps)

In [None]:
model.train()


In [None]:
print(len(train_dataloader))

In [None]:
total_step=len(train_dataloader)

In [None]:
total_steps

In [None]:
# Fine-tune the model: one optimizer step and one scheduler step per batch,
# printing the mean batch loss after every epoch.
for i in range(num_epoch):
  # Re-assert training mode so re-running this cell after model.eval()
  # still trains with dropout enabled.
  model.train()
  total_loss=0
  # Iterate the batches directly; the previous enumerate index was unused and
  # its name `id` shadowed the builtin for the rest of the notebook.
  for ipds,att,label in train_dataloader:
    ipds=ipds.to(device)
    att=att.to(device)
    label=label.to(device)
    optimizer.zero_grad()
    loss=model(ipds,attention_mask=att,labels=label,token_type_ids=None)[0]## token_type_ids useful for qa tasks
    loss.backward()
    optimizer.step()
    scheduler.step()

    total_loss+=loss.item()
  print('Epoch: {}, loss: {:.4f}'.format(i+1,total_loss/len(train_dataloader)))

#validation

In [None]:
test_str=new_df.iloc[2]['trans_text']
test_str

In [None]:
# Encode the single test sentence the same way the training data was encoded.
test_dict=tokenizer.encode_plus(test_str,add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 100,           # Pad & truncate all sentences.
                        padding = 'max_length',     # replaces the deprecated pad_to_max_length=True
                        truncation = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt')

In [None]:
test_dict

In [None]:
model.eval()

In [None]:
input_id, attention_mask = tokenization_map([test_str])

# Inference only: skip gradient tracking.
with torch.no_grad():
  g_label = model(input_id.to(device), token_type_ids=None, attention_mask=attention_mask.to(device))[0]
# Index of the highest-scoring class for the single input sentence.
pred = torch.max(g_label, 1)[1][0].item()

print('\n')
print(pred)
# The model is trained directly on the sentiment values, so the predicted
# class index is itself the sentiment label.
print('Predict: {}'.format(pred))
# test_str was taken from row 2 of new_df, so that row's sentiment is the
# ground truth (not the label tensor left over from the last training batch).
print('GT: {}'.format(new_df.iloc[2]['sentiment']))
print('---------------------------------------------------------------------------------------------\n')