In [None]:
# Colab only: mount Google Drive at /content/drive. The CSV path and the
# artifact paths used further down in this notebook all live under this mount.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
!pip install emoji
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import re
import emoji
from nltk.stem import PorterStemmer as ps
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import tensorflow as tf

Collecting emoji
  Downloading emoji-1.4.1.tar.gz (185 kB)
[?25l[K     |█▊                              | 10 kB 21.6 MB/s eta 0:00:01[K     |███▌                            | 20 kB 28.2 MB/s eta 0:00:01[K     |█████▎                          | 30 kB 18.5 MB/s eta 0:00:01[K     |███████                         | 40 kB 12.8 MB/s eta 0:00:01[K     |████████▉                       | 51 kB 5.5 MB/s eta 0:00:01[K     |██████████▋                     | 61 kB 5.7 MB/s eta 0:00:01[K     |████████████▍                   | 71 kB 4.7 MB/s eta 0:00:01[K     |██████████████▏                 | 81 kB 5.2 MB/s eta 0:00:01[K     |████████████████                | 92 kB 5.7 MB/s eta 0:00:01[K     |█████████████████▊              | 102 kB 5.2 MB/s eta 0:00:01[K     |███████████████████▌            | 112 kB 5.2 MB/s eta 0:00:01[K     |█████████████████████▎          | 122 kB 5.2 MB/s eta 0:00:01[K     |███████████████████████         | 133 kB 5.2 MB/s eta 0:00:01[K     |███████

In [None]:
# Path of the airline-sentiment CSV on the mounted Drive; handed to airline_data(...) below.
csv_path = "/content/drive/MyDrive/ENTHIRE/airline_sentiment_analysis.csv"

class airline_data():
    """Load the airline tweet CSV and turn it into padded token-id sequences.

    Pipeline (each step calls the previous one when needed):
    ``preprocess_data`` -> ``retrieve_vocab_info`` -> ``tokenize_words``.
    """

    # Default location of the pickled tokenizer written by tokenize_words().
    DEFAULT_TOKENIZER_PATH = "/content/drive/MyDrive/ENTHIRE/GRU_Model/tokenizer_file3.pkl"

    def __init__(self, data_dir_path):
        """Read the CSV and keep only the ``text`` and ``airline_sentiment`` columns.

        Args:
            data_dir_path: passed straight to ``pd.read_csv`` (so, despite the
                name, it is the path of the CSV file itself).
        """
        raw = pd.read_csv(data_dir_path)
        self.df = raw[["text", "airline_sentiment"]].copy()

    def process_text(self, text):
        """Clean one tweet and return it as a list of stemmed tokens.

        Steps: lower-case, drop @mentions, drop '#', drop URLs, turn emojis
        into words, map dollar amounts to 'dollar', strip punctuation, map
        digit runs to 'number', split on whitespace, stem.

        NOTE(review): this cleaning differs from the previous version (URLs
        are now fully removed and no empty tokens are produced), so any
        tokenizer/model saved with the old cleaning should be regenerated.
        """
        cleaned = text.lower()
        cleaned = re.sub(r'@\w+', '', cleaned)                    # remove @mentions
        cleaned = re.sub(r'#', '', cleaned)                       # remove the hashtag sign, keep the word
        # URLs must go BEFORE ':' is replaced below; otherwise "http://x" becomes
        # "http //x", only "http" is stripped and the rest leaks in as a token.
        cleaned = re.sub(r'http\S+', '', cleaned)
        cleaned = re.sub(r':', ' ', emoji.demojize(cleaned))      # emojis -> their text names
        cleaned = re.sub(r'\$\S+', 'dollar', cleaned)             # dollar amounts -> 'dollar'
        cleaned = re.sub(r'[^a-z0-9\s]', '', cleaned)             # remove punctuation
        cleaned = re.sub(r'[0-9]+', 'number', cleaned)            # digit runs -> 'number'

        stemmer = ps()                                            # one stemmer per text, not per word
        # str.split() with no argument splits on any whitespace run and never
        # yields empty strings, so no separate strip/remove('') pass is needed.
        return [stemmer.stem(token) for token in cleaned.split()]

    def preprocess_data(self):
        """Populate ``self.Texts`` (token lists) and ``self.labels`` (0/1).

        Labels: 'negative' -> 0, 'positive' -> 1. Any other label value
        raises ``ValueError`` from ``list.index``.
        """
        self.Texts = self.df["text"].apply(self.process_text)
        sentiment_ordering = ['negative', 'positive']
        self.labels = self.df["airline_sentiment"].apply(lambda x: sentiment_ordering.index(x))

    def retrieve_vocab_info(self):
        """Run preprocessing and measure the corpus.

        Returns:
            ``(vocab_length, max_seq_length)``: number of distinct tokens over
            all texts, and the token count of the longest text (0 if there
            are no texts). Both are also stored on the instance.
        """
        self.preprocess_data()
        vocabulary = set()
        for tokens in self.Texts:
            vocabulary.update(tokens)

        self.vocab_length = len(vocabulary)
        self.max_seq_length = max((len(tokens) for tokens in self.Texts), default=0)
        return self.vocab_length, self.max_seq_length

    def tokenize_words(self, tokenizer_path=None):
        """Split the data 70/30, fit a tokenizer on the training part, pad, and save the tokenizer.

        Args:
            tokenizer_path: where to pickle the fitted tokenizer; defaults to
                ``DEFAULT_TOKENIZER_PATH``.

        Returns:
            ``(X_train, X_test, y_train, y_test)``; the X arrays are padded
            with zeros at the end up to ``max_seq_length``.

        Preprocessing results from an earlier ``retrieve_vocab_info()`` call on
        this instance are reused; call ``retrieve_vocab_info()`` again first if
        ``self.df`` was changed in between.
        """
        if tokenizer_path is None:
            tokenizer_path = self.DEFAULT_TOKENIZER_PATH

        # Avoid cleaning and stemming the whole corpus a second time.
        if not hasattr(self, "max_seq_length"):
            self.retrieve_vocab_info()

        (self.training_sentences, self.testing_sentences,
         self.y_train, self.y_test) = train_test_split(
            self.Texts, self.labels, train_size=0.7, random_state=10)

        # The tokenizer is fitted on the training split only; unseen test words map to <OOV>.
        tokenizer = Tokenizer(num_words=self.vocab_length, oov_token="<OOV>")
        tokenizer.fit_on_texts(self.training_sentences)

        self.training_sequences = tokenizer.texts_to_sequences(self.training_sentences)
        self.X_train = pad_sequences(self.training_sequences, maxlen=self.max_seq_length, padding='post')

        self.testing_sequences = tokenizer.texts_to_sequences(self.testing_sentences)
        self.X_test = pad_sequences(self.testing_sequences, maxlen=self.max_seq_length, padding='post')

        # Saved so the same word->id mapping can be reused at inference time;
        # the context manager guarantees the file is flushed and closed.
        with open(tokenizer_path, "wb") as tokenizer_file:
            pickle.dump(tokenizer, tokenizer_file)

        return self.X_train, self.X_test, self.y_train, self.y_test

# DATA LOADING
data = airline_data(csv_path)
# tokenize_words() runs retrieve_vocab_info() itself, so calling that method
# separately first would clean and stem the whole corpus twice. Call
# tokenize_words() once and read the corpus stats it stored on the instance.
X_train, X_test, y_train, y_test = data.tokenize_words()
vocab_length, max_seq_length = data.vocab_length, data.max_seq_length

In [None]:

import lightgbm as lgb
import joblib
from sklearn import metrics
from sklearn.metrics import accuracy_score,f1_score
from sklearn import preprocessing
import os

In [None]:
# %%time
RESULTS_FOLDER = "/content/drive/MyDrive/ENTHIRE/"

# Wrap the padded training sequences and their labels in LightGBM's Dataset container.
d_train = lgb.Dataset(X_train, label=y_train)

# Booster hyper-parameters.
params = {
    'learning_rate': 0.03,
    'boosting_type': 'gbdt',        # gradient-boosted decision trees
    'objective': 'binary',          # binary target
    'metric': 'binary_logloss',     # log-loss for binary classification
    'max_depth': 10,
}

# Fit the booster for 100 boosting rounds.
model = lgb.train(params, d_train, num_boost_round=100)


# Persist the trained model inside RESULTS_FOLDER.
joblib.dump(model, os.path.join(RESULTS_FOLDER, 'LGBM_Model_2.pkl'))

['/content/drive/MyDrive/ENTHIRE/LGBM_Model_2.pkl']

In [None]:
# With objective='binary' the model's predict() output is thresholded at 0.5:
# values <= 0.5 become class 0, values > 0.5 become class 1.
y_prob = model.predict(X_test)
y_pred = (y_prob > 0.5).astype(int)

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy and binary
# F1 give the same number either way, but precision/recall-style metrics do not,
# so keep the documented order.
print("ACCURACY: {}".format(accuracy_score(y_test, y_pred)))
print("F1_SCORE: {}".format(f1_score(y_test, y_pred)))

ACCURACY: 0.8362691308114352
F1_SCORE: 0.40752351097178685
