In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import WordPunctTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the training set. Each line is split on tabs: the first field is the
# query text and the second is the answer; any further fields are ignored.
queries, answers = [], []
# Explicit UTF-8: the file contains CJK and Cyrillic text, and without an
# encoding argument open() falls back to the platform's locale encoding.
with open('req_ans_learn.tsv', encoding='utf-8') as f:
    for line_no, line in enumerate(f, start=1):
        tokens = line.strip().split('\t')
        if len(tokens) < 2:
            # Fail with the offending line number instead of a bare IndexError.
            raise ValueError(
                'req_ans_learn.tsv line %d: expected "<query>\\t<answer>", got %r'
                % (line_no, line)
            )
        queries.append(tokens[0])
        answers.append(tokens[1])

In [3]:
# Row counts of the two parallel lists; they are expected to be equal.
len(queries), len(answers)

(500000, 500000)

In [4]:
# Peek at the first five raw queries and their answers.
queries[:5], answers[:5]

(['麒麟倶楽部',
  'data integrity что это',
  'a harmonious rejuvenation Add to your retreat experience by visiting the Aberdeen beachside the august Balmoral Castle and one of the most beautiful Scottish sites the Royal Deeside Benefit from the city s largest hotel',
  'Can Can',
  'all the quiet'],
 ['/wiki/Qilin',
  '/wiki/Data_integrity',
  '/wiki/Balmoral_Castle',
  '/wiki/Can-can',
  '/wiki/All_Quiet_on_the_Western_Front'])

In [5]:
# Show the first lines of the test file using the shell's `head` (IPython `!` escape).
!head req_ans_test_no_url.tsv

there are two rivers in the city: the Selenga
biriani spice
бернли команда
he 114
википедия про мегатрона из кино 3 википедия
Новый Орлеан рок группы начала 00s
the bella twins
d.c. in English
grade4 аналоги
Трэвис, Мерл


In [6]:
# Load the test queries: every line of the file is taken as one query.
# Explicit UTF-8: the file contains Cyrillic text, and without an encoding
# argument open() falls back to the platform's locale encoding.
with open('req_ans_test_no_url.tsv', encoding='utf-8') as f:
    # strip() removes the trailing newline (and surrounding whitespace), the
    # same way the training queries are cleaned when they are read.
    # Blank lines are kept as '' so the number of rows still matches the file.
    test_queries = [line.strip() for line in f]

In [7]:
# Number of test queries read from the file.
len(test_queries)

141371

In [8]:
# Lower-case every training and test query.
# Slice assignment rewrites the contents of the existing list objects, so the
# lists are still modified in place.
queries[:] = [text.lower() for text in queries]
test_queries[:] = [text.lower() for text in test_queries]

In [9]:
# NLTK tokenizer instance used for both the training and the test queries.
tokenizer = WordPunctTokenizer()

In [10]:
# Tokenize each query string; afterwards every element is a list of tokens.
# NOTE: re-running this cell without re-running the cells above would feed
# token lists (not strings) back into the tokenizer.
queries = [tokenizer.tokenize(text) for text in queries]
test_queries = [tokenizer.tokenize(text) for text in test_queries]

In [11]:
# Glue the tokens of each query back together with single spaces, so every
# query is a plain string again.
queries = list(map(' '.join, queries))
test_queries = list(map(' '.join, test_queries))

In [12]:
# First five training queries after lower-casing and token re-joining.
queries[:5]

['麒麟倶楽部',
 'data integrity что это',
 'a harmonious rejuvenation add to your retreat experience by visiting the aberdeen beachside the august balmoral castle and one of the most beautiful scottish sites the royal deeside benefit from the city s largest hotel',
 'can can',
 'all the quiet']

In [13]:
# First five test queries after lower-casing and token re-joining.
test_queries[:5]

['there are two rivers in the city : the selenga',
 'biriani spice',
 'бернли команда',
 'he 114',
 'википедия про мегатрона из кино 3 википедия']

In [14]:
# TF-IDF vectorizer for the query strings; all options other than stop_words
# are left at their defaults.
# NOTE(review): the queries are multilingual (English, Russian and Japanese
# appear in the previews), while stop_words='english' presumably filters only
# English stop words — confirm this is intended.
vectorizer = TfidfVectorizer(stop_words='english')

In [15]:
# Fit the vectorizer on training and test queries together, so terms that
# occur only in the test queries are part of the fitted vocabulary too.
all_queries = queries + test_queries
vectorizer.fit(all_queries)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [16]:
# Number of distinct terms in the fitted vocabulary.
len(vectorizer.vocabulary_)

288625

In [17]:
# Turn the training queries into TF-IDF vectors using the fitted vectorizer.
train_vectors = vectorizer.transform(queries)

In [18]:
# (number of training queries, vocabulary size)
train_vectors.shape

(500000, 288625)

In [19]:
# Turn the test queries into TF-IDF vectors using the same fitted vectorizer.
test_vectors = vectorizer.transform(test_queries)

In [20]:
# (number of test queries, vocabulary size)
test_vectors.shape

(141371, 288625)

In [21]:
from sklearn.neighbors import KNeighborsClassifier

In [22]:
# k-nearest-neighbours classifier over the TF-IDF vectors.
model = KNeighborsClassifier(
    n_neighbors=10,      # neighbours consulted per prediction
    weights='uniform',
    metric='cosine',     # distance metric used to compare query vectors
    n_jobs=-1,
)
model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='cosine',
           metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
           weights='uniform')

In [23]:
# Fit the classifier: TF-IDF rows are the features, the answers are the labels.
model.fit(train_vectors, answers)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='cosine',
           metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
           weights='uniform')

In [24]:
# Predict one answer per test query.
preds = model.predict(test_vectors)

In [25]:
# First five predicted answers.
preds[:5]

array(['/wiki/Chicago_River', '/wiki/Spice_Up_Your_Life',
       '/wiki/Burnley_F.C.', '/wiki/1080p', '/wiki/Wikipedia'],
      dtype='<U115')

In [31]:
# Read sample.csv (presumably the submission template — it has Id and Category
# columns) and preview its first rows.
subm = pd.read_csv('sample.csv')
subm.head()

Unnamed: 0,Id,Category
0,1,/wiki/
1,2,/wiki/
2,3,/wiki/
3,4,/wiki/
4,5,/wiki/


In [None]:
# Store the predictions in the 'Category' column.
# Item assignment is used instead of attribute assignment: if the column were
# ever missing, attribute assignment would only set a plain attribute on the
# DataFrame object instead of creating the column.
subm['Category'] = preds