In [None]:
import numpy as np
import pandas as pd
import pickle
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('stopwords')
nltk.download('vader_lexicon')
from sklearn.metrics import f1_score
import time

In [None]:
def decontracted(text):
    '''Expand common English contractions in text (e.g. "won't" -> "will not").

    The substitutions run one after another in the order listed, so the
    irregular forms ("won't", "can't") are rewritten before the generic
    "n't" rule can reach them. Matching is case-sensitive and only the
    plain ASCII apostrophe is recognised.
    '''
    substitutions = (
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in substitutions:
        text = re.sub(pattern, expansion, text)
    return text

In [None]:
def remove_special_chars(text):
  '''Replace each run of non-alphanumeric characters with one space and lower-case the text.'''
  # Everything outside A-Z, a-z and 0-9 (punctuation, whitespace, non-ASCII letters) is treated as "special".
  return re.sub('[^A-Za-z0-9]+', ' ', text).lower()

In [None]:
stemmer=SnowballStemmer('english')
stopWords=stopwords.words('english')

# Take the negations "not", "no" and "nor" out of the English stopword list;
# list.remove raises ValueError if one of them is not present.
for negation in ('not','no','nor'):
  stopWords.remove(negation)

In [None]:
def remove_stopwords(text):
  '''Return text with every whitespace-separated token that appears in stopWords dropped.

  The surviving tokens are re-joined with single spaces.
  '''
  kept_tokens=(token for token in text.split() if token not in stopWords)
  return ' '.join(kept_tokens)

In [None]:
def stemming(text):
  '''Stem each whitespace-separated word of text and return the stems joined by single spaces.'''
  stemmed_words=map(stemmer.stem,text.split())
  return ' '.join(stemmed_words)

In [None]:
def preprocess_text(text):
  '''Run the full cleaning pipeline on text and return the cleaned string.

  Steps, in order: expand contractions, strip special characters and
  lower-case, drop stopwords, stem the remaining words.
  '''
  pipeline=(decontracted,remove_special_chars,remove_stopwords,stemming)
  for step in pipeline:
    text=step(text)
  return text

In [None]:
def get_embedding_features(data,word_embeddings,model_words,embedding_dim=300):
  '''Average the word vectors of every text in data['Description'].

  Parameters
  ----------
  data : DataFrame with a 'Description' column of strings (split on whitespace).
  word_embeddings : mapping from word to its vector, indexed as word_embeddings[word].
  model_words : container of the words available in word_embeddings; only words
      found in it are looked up.
  embedding_dim : length of each word vector. Defaults to 300, the size that
      was previously hard-coded, so existing callers are unaffected.

  Returns
  -------
  numpy array with one row per description holding the mean of the vectors of
  its known words; a description without any known word yields an all-zero row.
  '''
  vector_rep=[]
  for text in data['Description'].values: # For each description
    vector=np.zeros(embedding_dim)
    n=0 # number of words of this description that had an embedding
    for word in text.split(): # For each word in the description
      if word in model_words:
        vector+=word_embeddings[word] # accumulate the word's vector
        n+=1
    if n!=0: # skip the division when no word was recognised (row stays all zeros)
      vector/=n
    vector_rep.append(vector)
  return np.array(vector_rep)

In [None]:
def get_word_char_lengths(data):
  '''Build a frame with the word count and character count of each 'Description' text.

  Returns a DataFrame with the columns 'length_word_level' (number of
  whitespace-separated words) and 'length_char_level' (number of characters,
  spaces included), one row per input row, on a fresh default index.
  '''
  rows=[[len(text.split()),len(text)] for text in data['Description']]
  return pd.DataFrame(rows,columns=['length_word_level','length_char_level'])

In [None]:
sid = SentimentIntensityAnalyzer()
def sentiment_score(data):
  '''Compute the VADER polarity scores of every text in data['Description'].

  Returns a DataFrame with the columns 'negative', 'neutral', 'positive' and
  'compound', one row per description, on a fresh default index.
  '''
  # Read the scores by key rather than relying on the order of the returned dict,
  # so each value is guaranteed to land in the column that names it.
  score_keys=('neg','neu','pos','compound')
  sentiments=[]
  for text in data['Description'].values:
    polarities=sid.polarity_scores(text) # Getting the sentiment scores of text
    sentiments.append([polarities[key] for key in score_keys])
  return pd.DataFrame(sentiments,columns=['negative','neutral','positive','compound'])

In [None]:
#Loading the models and files required
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so these
# files must only ever come from a trusted source.

def _load_pickle(path):
  '''Unpickle and return the object stored at path, closing the file afterwards.'''
  with open(path,'rb') as handle:
    return pickle.load(handle)

model = _load_pickle('lightgbm.pkl')

scalar = _load_pickle('scalar.pkl')

glove_word_embeddings = _load_pickle('glove_word_embeddings.pkl')
glove_words=glove_word_embeddings.keys()

In [None]:
def get_vector_representation(query):
  '''Turn one raw query string into a flat feature vector.

  The query is preprocessed and wrapped in a one-row DataFrame; three feature
  groups are then computed and placed side by side: 300 averaged
  word-embedding values ('embed_0' .. 'embed_299'), the word/character
  lengths, and the sentiment scores. The single resulting row is returned as
  a 1-D numpy array.
  '''
  query_frame=pd.DataFrame([preprocess_text(query)],columns=['Description'])

  embedding_columns=['embed_'+str(i) for i in range(300)]
  embedding_features=pd.DataFrame(
    get_embedding_features(query_frame,glove_word_embeddings,glove_words),
    columns=embedding_columns)
  length_features=get_word_char_lengths(query_frame)
  sentiment_features=sentiment_score(query_frame)

  feature_row=pd.concat([embedding_features,length_features,sentiment_features],axis=1)
  return feature_row.values[0]

In [None]:
# Sanity check: count how many of the first 300 (embedding) values are exactly zero for the query 'leav'.
np.equal(get_vector_representation('leav')[:300],np.zeros(300)).sum()

300

In [None]:
def final_fun_1(queries):
  '''Predict labels for an iterable of raw query strings.

  Each query is converted with get_vector_representation and the rows are
  stacked into a 2-D array. The columns from index 300 onwards (everything
  after the embedding values) are rescaled in place with the loaded scalar,
  and the output of model.predict on that array is returned.
  '''
  feature_matrix=np.asarray([get_vector_representation(text) for text in queries])
  # Only the trailing, non-embedding features go through the scaler.
  feature_matrix[:,300:]=scalar.transform(feature_matrix[:,300:])
  return model.predict(feature_matrix)

In [None]:
start = time.time()

query=['''Catcalls and passing comments were two of the ghastly things the Delhi police at the International Airport put me and 
            my friend through. It is appalling that the protectors and law enforcers at the airport can make someone so uncomfortable.''',
            '''Some people used to stare in a very inappropriate way which is not tolerable.It happened in the morning and Night.''']

predictions=final_fun_1(query)

labels=['Commenting','Ogling/Facial Expressions/Staring','Touching /Groping']

# One output line per query: the query number followed by the name of every label predicted as 1.
for query_no,predicted_flags in enumerate(predictions):
  print('Query ',str(query_no),': ',end= '')
  print(''.join(labels[pos]+', ' for pos,flag in enumerate(predicted_flags) if flag==1))


end = time.time()

print(f"\n \nRuntime of the function is {end - start}")

Query  0 : Commenting, Ogling/Facial Expressions/Staring, 
Query  1 : Commenting, Touching /Groping, 

 
Runtime of the function is 0.014860391616821289


In [None]:
def final_fun_2(queries,target):
  '''Score the model on labelled queries with the macro-averaged F1.

  queries : iterable of raw query strings.
  target : true labels, passed to f1_score as y_true next to the predictions
      returned by final_fun_1.

  Returns the value of f1_score(target, predictions, average='macro').
  '''
  # Reuse the prediction pipeline instead of repeating the featurise/scale/predict
  # steps here, so the two entry points cannot drift apart.
  y_pred=final_fun_1(queries)
  return f1_score(target,y_pred,average='macro')

In [None]:
start = time.time()

# Expected label flags for the two queries below, one row per query.
targets=np.asarray([[1,1,0],[1,0,1]])

query=['''Catcalls and passing comments were two of the ghastly things the Delhi police at the International Airport put me and 
            my friend through. It is appalling that the protectors and law enforcers at the airport can make someone so uncomfortable.''',
            '''Some people used to stare in a very inappropriate way which is not tolerable.It happened in the morning and Night.''']

score=final_fun_2(query,targets)
print('F1-macro : ',score)

end = time.time()
print(f"\n \nRuntime of the function is {end - start}")

F1-macro :  1.0

 
Runtime of the function is 0.020786046981811523
