# HackerRank - Guess the Flipkart Query

#### Problem Statement: https://www.hackerrank.com/challenges/guess-the-flipkart-query/problem

In [1]:
import numpy as np
import copy

from nltk import word_tokenize
from nltk import ngrams
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the training set. The first line of training.txt holds the number of
# records; each following line is "<query text>\t<label>".
# The context manager guarantees the file handle is closed even if parsing
# fails part-way through.
with open('training.txt', 'r') as f:
    lines = int(f.readline())
    text = []
    labels = []
    for i in range(lines):
        inp = f.readline()
        # Split on the first tab; a line without a tab raises ValueError.
        ind = inp.index('\t')
        # NOTE: the label deliberately keeps whatever follows the tab,
        # including the line's trailing newline. The test labels are read the
        # same way, so the two stay comparable.
        labels.append(inp[ind+1:])
        text.append(inp[:ind])

In [3]:
# English stop words from NLTK, stored as a set: remove_stop_words() performs
# one membership test per word of every query, and a set makes each test O(1)
# instead of a linear scan over a list.
stop_words = set(stopwords.words('english'))

# Preprocessing

In [4]:
def convert_lower_case(data):
    """Lower-case every character of ``data``.

    The work is delegated to ``np.char.lower``, so the value returned is a
    NumPy string array (zero-dimensional when ``data`` is a plain ``str``).
    """
    lowered = np.char.lower(data)
    return lowered

In [5]:
def remove_stop_words(data):
    """Drop every word of ``data`` that appears in ``stop_words``.

    ``data`` is converted with ``str()`` and split on single spaces, the
    surviving words are re-joined with single spaces, and surrounding
    whitespace is stripped via ``np.char.strip`` (so a NumPy string array is
    returned).
    """
    # Splitting on ' ' (not arbitrary whitespace) yields empty strings for
    # consecutive spaces; they are not stop words, so they are kept and the
    # repeated spaces survive inside the result.
    kept = [word for word in str(data).split(' ') if word not in stop_words]
    return np.char.strip(" ".join(kept))

In [6]:
def remove_apostrophe(data):
    """Delete every apostrophe (``'``) from ``data``.

    Uses ``np.char.replace``; the characters on either side of a removed
    apostrophe end up adjacent. Returns a NumPy string array.
    """
    apostrophe = "'"
    return np.char.replace(data, apostrophe, "")

In [7]:
def remove_punctuation(data):
    """Replace punctuation in ``data`` with spaces and delete commas.

    Every character in the symbol list below (which includes the backslash
    and the newline) is replaced by a single space; after each replacement,
    pairs of spaces are collapsed into one. Commas are handled last and are
    removed outright rather than turned into a space, so the text on either
    side of a comma is joined together.

    Returns a NumPy string array produced by ``np.char.replace``.
    """
    # The backslash is spelled "\\" explicitly: the previous "\]" was an
    # invalid escape sequence that only worked because Python kept the
    # backslash, and it triggers a warning on current interpreters.
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
        # Collapse the double space the replacement may have introduced.
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

In [8]:
def remove_single_characters(data):
    """Discard every whitespace-separated token of length one.

    ``data`` is converted with ``str()`` and split on any whitespace; tokens
    longer than one character are re-joined with single spaces. The result is
    passed through ``np.char.strip`` and therefore returned as a NumPy string
    array.
    """
    long_tokens = [token for token in str(data).split() if len(token) > 1]
    return np.char.strip(" ".join(long_tokens))

In [9]:
def convert_numbers(data):
    """Spell out each decimal digit in ``data`` as an English word.

    Every digit is replaced individually by its name padded with a space on
    both sides (``"1"`` becomes ``" one "``), so multi-digit numbers turn
    into a sequence of digit words. Returns a NumPy string array.
    """
    digit_words = (
        ("0", " zero "),
        ("1", " one "),
        ("2", " two "),
        ("3", " three "),
        ("4", " four "),
        ("5", " five "),
        ("6", " six "),
        ("7", " seven "),
        ("8", " eight "),
        ("9", " nine "),
    )
    for digit, spelled in digit_words:
        data = np.char.replace(data, digit, spelled)
    return data

In [10]:
def preprocess(data):
    """Run one piece of text through the full cleaning pipeline.

    The steps are applied in a fixed order, each receiving the output of the
    previous one: lower-casing, digit spelling, punctuation removal,
    stop-word removal, apostrophe removal, and single-character removal.
    The value returned is whatever the last step returns.
    """
    # Order matters: punctuation (with commas removed separately inside
    # remove_punctuation) is cleaned before stop words are matched, and
    # apostrophes are only stripped after the stop-word pass.
    pipeline = (
        convert_lower_case,
        convert_numbers,
        remove_punctuation,
        remove_stop_words,
        remove_apostrophe,
        remove_single_characters,
    )
    for step in pipeline:
        data = step(data)
    return data

In [11]:
def processes_arr(text):
    """Apply ``preprocess`` to every item of ``text`` and return the results as a list."""
    return [preprocess(item) for item in text]

In [12]:
def make_grams(data, n=1):
    """Append fused word n-grams to each entry of ``data``.

    Each entry is converted with ``str()``. For every size from 2 up to and
    including ``n``, the entry's whitespace-separated tokens are grouped into
    n-grams, the words of each n-gram are concatenated without a separator,
    and the fused token is appended to the entry after a single space.

    With the default ``n=1`` no n-grams are produced and each entry is
    returned simply as its string form.

    Returns a list of strings, one per entry of ``data``.
    """
    grammed_data = []
    for entry in data:
        sentence = str(entry)
        tokens = sentence.split()
        # Start from the untouched sentence, then add the fused n-grams.
        pieces = [sentence]
        for size in range(2, n + 1):
            pieces.extend("".join(gram) for gram in ngrams(tokens, size))
        grammed_data.append(" ".join(pieces))
    return grammed_data

In [13]:
def gen_grammed(text):
    """Clean every item of ``text`` and convert the results for vectorizing.

    The items are first cleaned with ``processes_arr`` and then passed to
    ``make_grams`` using its default ``n``, so no multi-word n-grams are
    added at this call site.
    """
    return make_grams(processes_arr(text))

In [14]:
# Cleaned training queries, one string per record, ready for vectorizing.
grammed_data = gen_grammed(text)

In [15]:
# Bag-of-words features: learn the vocabulary from the training queries and
# build their document-term matrix in one step, then densify it.
vectorizer = CountVectorizer()
vector = vectorizer.fit_transform(grammed_data)
feature_vector = vector.toarray()

# Train

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
# Logistic-regression classifier constructed with the library's default settings.
clf = LogisticRegression()

In [18]:
# Train on the dense training feature matrix; `labels` are the raw strings
# taken from after the tab on each training line.
clf.fit(feature_vector, labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [19]:
# Score on the same data the model was fitted on, so this measures fit to
# the training set rather than performance on unseen queries.
clf.score(feature_vector, labels)

0.990990990990991

# Test

In [20]:
# Load the sample test queries. The first line of sampleInput.txt is the
# number of queries; each following line is one query.
# Both files are opened with context managers so their handles are closed
# (previously the first handle was simply overwritten and never closed).
with open('sampleInput.txt', 'r') as f:
    lines = int(f.readline())
    test_text = [f.readline() for _ in range(lines)]

# Expected labels, one per line, in the same order as the queries.
# Lines are kept verbatim (including their line ending) so they have the same
# form as the training labels the classifier predicts.
with open('sampleOutput.txt', 'r') as f:
    test_labels = [f.readline() for _ in range(lines)]

In [21]:
# Clean the test queries with the same pipeline used for the training text.
test_grammed = gen_grammed(test_text)

In [22]:
# Transform only (no refit): the test queries are encoded with the
# vectorizer that was already fitted on the training data.
vector = vectorizer.transform(test_grammed)
test_feature_vector = vector.toarray()

In [23]:
# Predicted label for each test query, in input order.
vals = clf.predict(test_feature_vector)

In [24]:
# Score the predictions against the expected labels from sampleOutput.txt.
# NOTE(review): both label sets are raw file lines; a final line lacking a
# trailing newline in either file would not match its counterpart — confirm.
clf.score(test_feature_vector, test_labels)

0.7666666666666667

In [25]:
# Output for HackerRank
# lines = int(input())
# test_text = []
# for i in range(lines):
#     inp = str(input())
#     test_text.append(inp)
# for i in vals:
#     print(i[:len(i)-1])