In [15]:
import pandas as pd 
import numpy as np
import re
import collections
import matplotlib.pyplot as plt

# Packages for data preparation
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Packages for modeling
from keras import models
from keras import layers
from keras import regularizers

In [16]:
class Model:
    '''Bag-of-words sentiment classifier trained on airline_sentiment_analysis.csv.

    The pipeline (see train) cleans the text, fits a Keras Tokenizer, multi-hot
    encodes every text over a vocabulary capped at NB_WORDS, one-hot encodes the
    labels and fits a small dense network with a 2-unit softmax output.
    predict_class then classifies a new text with the fitted tokenizer and model.
    '''

    # Lazily filled cache of the NLTK English stop words, shared by all instances
    # so the list is loaded once instead of once per cleaned text.
    _stopword_cache = None
    # Words that are stop words but carry sentiment, so they are always kept.
    _STOPWORD_WHITELIST = frozenset(["n't", "not", "no"])

    def __init__(self):
        self.X_train= None   #train data before splitting validation data
        self.X_test= None    #test data
        self.y_train= None   #train data before splitting validation data
        self.y_test= None    #test data
        self.X_train_seq= None  #train data converted to sequences
        self.X_test_seq= None   #test data converted to sequences
        self.X_train_oh= None   #X_train_seq after multi-hot encoding (set in train)
        self.X_test_oh= None    #X_test_seq after multi-hot encoding (set in train)

        self.label_encoder= None  #LabelEncoder fitted on y_train (set in encoders)
        self.y_train_le= None   #y_train after applying fit_transform
        self.y_test_le= None    #y_test after applying transform
        self.y_train_oh = None  #y_train after one hot enconding
        self.y_test_oh = None   #y_test after one hot encoding

        self.X_train_rest= None  #X_train after splitting validation data
        self.X_valid= None       #X_valid - validation data
        self.y_train_rest= None  #y_train after splitting validation data
        self.y_valid= None       #y_valid - validation data

        self.token= None         #Tokenizer fitted on X_train (set in tokenize)

        self.epoch_stop= 4      #no. of epochs after which model starts overfitting

        self.NB_WORDS = 10000  # Parameter indicating the number of words we'll put in the dictionary
        self.VAL_SIZE = 1000  # Size of the validation set (not read by any method of this class)
        self.NB_START_EPOCHS = 20  # Number of epochs for training
        self.BATCH_SIZE = 512  # Size of the batches

        self.base_model= None
        self.reduced_model= None
        self.reg_model= None
        self.drop_model= None

        self.main_model= None   #the model selected in train and used for prediction

        self.history= None      #History object returned by the last deep_model fit

    @classmethod
    def _stopword_set(cls):
        '''Return the English stop words as a frozenset, loading them on first use.'''
        if cls._stopword_cache is None:
            cls._stopword_cache = frozenset(stopwords.words('english'))
        return cls._stopword_cache

    def _clean_text(self, input_text):
        '''Apply the training-time cleaning: stop-word removal, then mention removal.'''
        return self.remove_mentions(self.remove_stopwords(input_text))

    def remove_stopwords(self,input_text):
        '''Remove English stop words and single-character tokens from input_text.

        Whitelisted negations ("n't", "not", "no") are kept even though they are
        stop words, because they can flip the sentiment. Matching is case
        sensitive. Returns the remaining whitespace-split words joined by one space.
        '''
        stop_set = self._stopword_set()
        kept = []
        for word in input_text.split():
            if len(word) <= 1:
                continue
            if word in stop_set and word not in self._STOPWORD_WHITELIST:
                continue
            kept.append(word)
        return " ".join(kept)

    def remove_mentions(self,input_text):
        '''Remove every @mention (an @ followed by word characters) from input_text.'''
        return re.sub(r'@\w+', '', input_text)

    def one_hot_seq(self,seqs):
        '''Multi-hot encode integer sequences.

        Returns a float array of shape (len(seqs), NB_WORDS) in which row i has a
        1 at every index that occurs in seqs[i] and 0 elsewhere.
        '''
        ohs = np.zeros((len(seqs), self.NB_WORDS))
        for i, s in enumerate(seqs):
            ohs[i, s] = 1.
        return ohs

    def tokenize(self):
        '''Create the tokenizer (vocabulary capped at NB_WORDS) and fit it on X_train.'''
        self.token = Tokenizer(num_words=self.NB_WORDS,
                       filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                       lower=True,
                       split=" ")
        self.token.fit_on_texts(self.X_train)

    def tokennize_to_seq(self):
        '''Convert the train and test texts to sequences of word indices.'''
        self.X_train_seq = self.token.texts_to_sequences(self.X_train)
        self.X_test_seq = self.token.texts_to_sequences(self.X_test)

    def encoders(self):
        '''Integer-encode the labels, then one-hot encode them.

        The encoder is fitted on y_train only and kept on self.label_encoder.
        num_classes is passed explicitly so the train and test one-hot matrices
        always have the same width, even if a split happens to miss a class.
        '''
        le = LabelEncoder()
        self.y_train_le = le.fit_transform(self.y_train)
        self.y_test_le = le.transform(self.y_test)
        self.label_encoder = le
        n_classes = len(le.classes_)
        self.y_train_oh = to_categorical(self.y_train_le, num_classes=n_classes)
        self.y_test_oh = to_categorical(self.y_test_le, num_classes=n_classes)

    def get_valid_data(self):
        '''Hold out 10% of the encoded training data as a validation set.'''
        self.X_train_rest, self.X_valid, self.y_train_rest, self.y_valid = train_test_split(self.X_train_oh, self.y_train_oh, test_size=0.1, random_state=1)

    def base_model_fun(self):
        '''Build the base model: two 64-unit ReLU layers and a 2-way softmax.'''
        self.base_model = models.Sequential()
        self.base_model.add(layers.Dense(64, activation='relu', input_shape=(self.NB_WORDS,)))
        self.base_model.add(layers.Dense(64, activation='relu'))
        self.base_model.add(layers.Dense(2, activation='softmax'))

    def reduced_model_fun(self):
        '''Build a smaller variant of the base model: one 32-unit ReLU layer.'''
        self.reduced_model = models.Sequential()
        self.reduced_model.add(layers.Dense(32, activation='relu', input_shape=(self.NB_WORDS,)))
        self.reduced_model.add(layers.Dense(2, activation='softmax'))

    def reg_model_fun(self):
        '''Build the base model with L2 (0.001) kernel regularization on the hidden layers.'''
        self.reg_model = models.Sequential()
        self.reg_model.add(layers.Dense(64, kernel_regularizer=regularizers.l2(0.001), activation='relu', input_shape=(self.NB_WORDS,)))
        self.reg_model.add(layers.Dense(64, kernel_regularizer=regularizers.l2(0.001), activation='relu'))
        self.reg_model.add(layers.Dense(2, activation='softmax'))

    def drop_model_fun(self):
        '''Build the base model with a 0.5 dropout layer after each hidden layer.'''
        self.drop_model = models.Sequential()
        self.drop_model.add(layers.Dense(64, activation='relu', input_shape=(self.NB_WORDS,)))
        self.drop_model.add(layers.Dropout(0.5))
        self.drop_model.add(layers.Dense(64, activation='relu'))
        self.drop_model.add(layers.Dropout(0.5))
        self.drop_model.add(layers.Dense(2, activation='softmax'))

    def deep_model(self,model):
        '''Compile model and fit it for NB_START_EPOCHS epochs with validation.

        Trains on the X_train_rest/y_train_rest split, validates on
        X_valid/y_valid and stores the returned History on self.history.
        '''
        model.compile(optimizer='adam'
                      , loss='categorical_crossentropy'
                      , metrics=['accuracy'])

        self.history = model.fit(self.X_train_rest
                           , self.y_train_rest
                           , epochs=self.NB_START_EPOCHS
                           , batch_size=self.BATCH_SIZE
                           , validation_data=(self.X_valid, self.y_valid)
                           , verbose=0)

    def eval_metric(self, metric_name):
        '''Plot the train and validation curves of metric_name from self.history.'''
        metric = self.history.history[metric_name]
        val_metric = self.history.history['val_' + metric_name]

        # Take the x-axis from the recorded history rather than NB_START_EPOCHS,
        # so the plot still works when the fit ran a different number of epochs.
        e = range(1, len(metric) + 1)

        plt.plot(e, metric, 'bo', label='Train ' + metric_name)
        plt.plot(e, val_metric, 'b', label='Validation ' + metric_name)
        plt.legend()
        plt.show()

    def test_model(self,model):
        '''Compile model, fit it on the full training set and evaluate on the test set.

        Only epoch_stop epochs are run, because the models start overfitting
        beyond that. Returns whatever model.evaluate returns for the test data.
        '''
        model.compile(optimizer='adam'
                      , loss='categorical_crossentropy'
                      , metrics=['accuracy'])
        model.fit(self.X_train_oh
                  , self.y_train_oh
                  , epochs=self.epoch_stop
                  , batch_size=self.BATCH_SIZE
                  , verbose=0)
        results = model.evaluate(self.X_test_oh, self.y_test_oh)

        return results

    def predict_class(self,text):
        '''Predict the sentiment class of the passed text.

        text may be a single string or a sequence of strings; as before, only the
        first text of a sequence is classified (an empty sequence raises
        IndexError). The text gets the same cleaning as the training data, is
        encoded with the fitted tokenizer and classified by main_model, falling
        back to drop_model when main_model is unset. The label is printed and
        returned.
        '''
        # NOTE(review): assumes the label encoder mapped the negative class to 0
        # and the positive class to 1 - confirm against the CSV labels.
        sentiment_classes = ['Negative', 'Positive']

        # texts_to_sequences iterates over its argument, so a bare string would be
        # tokenised character by character; pick out the one document explicitly.
        if isinstance(text, str):
            first_text = text
        else:
            first_text = list(text)[0]

        # Clean exactly like read_data does, so inference input matches training input.
        cleaned = self._clean_text(first_text)

        # Transforms text to a sequence of integers using the tokenizer object
        xt = self.token.texts_to_sequences([cleaned])
        # Reuse the training encoder so the width follows NB_WORDS, not a literal
        ohs = self.one_hot_seq(xt)

        classifier = self.main_model if self.main_model is not None else self.drop_model
        yt = classifier.predict(ohs).argmax()
        print('The predicted sentiment is', sentiment_classes[yt])
        return sentiment_classes[yt]

    def read_data (self):
        '''Read the CSV, clean the text column and split into train and test (90/10).'''
        data= pd.read_csv("airline_sentiment_analysis.csv")
        # Copy the column subset so the assignment below writes to an independent
        # frame instead of a slice of the original one.
        data = data[['text', 'airline_sentiment']].copy()
        data['text'] = data['text'].apply(self._clean_text)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(data.text, data.airline_sentiment, test_size=0.1, random_state=1)

    def train(self):
        '''Run the full pipeline: load, encode, build the dropout model and fit it.'''
        self.read_data()
        self.tokenize()
        self.tokennize_to_seq()
        self.X_train_oh = self.one_hot_seq(self.X_train_seq)
        self.X_test_oh = self.one_hot_seq(self.X_test_seq)
        self.encoders()
        self.get_valid_data()
        self.drop_model_fun()
        self.main_model= self.drop_model
        # deep_model + eval_metric form the exploratory path for inspecting
        # overfitting; the served model is fitted directly via test_model.
        self.test_model(self.main_model)
    

In [17]:
from fastapi import FastAPI
import uvicorn
from pydantic import BaseModel

# ASGI application object that the route decorators below register against.
app = FastAPI()

# Build the classifier and run its full training pipeline once, when this cell
# executes, so the /predict handler has a fitted tokenizer and model to use.
model = Model()
model.train()
   
# Path operation for the root endpoint: GET / answers with a static greeting.
@app.get('/')
def main():
    greeting = {'message': 'Welcome!'}
    return greeting
  
# Path operation for the /predict endpoint: POST /predict classifies one text.
@app.post('/predict')
def predict(data : str):
    # The classifier is handed a one-element list of texts; its predicted
    # sentiment label is returned to the client as the response.
    batch = [data]
    return model.predict_class(batch)

