### Importing the libraries


In [52]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

### Loading the Train dataset


In [53]:
# Read the labelled training set from an Excel workbook (relative path);
# later cells use its 'category' and 'text' columns.
train_data=pd.read_excel('train.xlsx')

In [54]:
# Display the training frame to inspect its columns and a sample of rows.
train_data

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
...,...,...
2210,politics,teens know little of politics teenagers ques...
2211,entertainment,lopez misses uk charity premiere jennifer lope...
2212,business,christmas shoppers flock to tills shops all ov...
2213,tech,progress on new internet domains by early 2005...


In [55]:
# Number of training documents per category (quick class-balance check).
train_data['category'].value_counts()

category
sport            509
business         508
politics         415
tech             399
entertainment    384
Name: count, dtype: int64

### Loading the Test dataset


In [56]:
# Read the test set from an Excel workbook (relative path); the prediction
# function reads only its 'text' column.
test_data=pd.read_excel('test.xlsx')

In [57]:
# Display the test frame to inspect the queries that will be classified.
test_data

Unnamed: 0,category,text
0,,junk e-mails on relentless rise spam traffic i...
1,,top stars join us tsunami tv show brad pitt r...
2,,rings of steel combat net attacks gambling is ...
3,,davies favours gloucester future wales hooker ...
4,,beijingers fume over parking fees choking traf...
5,,cars pull down us retail figures us retail sal...
6,,kilroy unveils immigration policy ex-chatshow ...
7,,rem announce new glasgow concert us band rem h...
8,,how political squabbles snowball it s become c...
9,,souness delight at euro progress boss graeme s...


### Generating the hash table from the train data

In [58]:
# Random-hyperplane hashing of the TF-IDF vectors of the training text.
# Globals produced here and consumed by the prediction cell:
#   tfidf         - fitted TfidfVectorizer (reused to transform queries)
#   text_vec      - dense (n_docs, n_features) TF-IDF matrix of the train text
#   category_dict - row position in text_vec -> category label
#   hyp_arr       - (m, n_features) array of random hyperplane weights
#   hash_table    - tuple of m signs (+1/-1) -> list of row positions

# presetting the attributes
np.random.seed(0)  # fixed seed so the random hyperplanes are reproducible
min_df = 10
max_features = 4000
ngram_range = (2, 3)

# converting the text to vector using TFIDF vectorizer
text = train_data['text']
tfidf = TfidfVectorizer(ngram_range=ngram_range,
                        max_features=max_features,
                        min_df=min_df)
text_vector = tfidf.fit_transform(text)
text_vec = text_vector.toarray()

# creating a dictionary to store index of the vector and the corresponding category
# (positional enumeration keeps the keys aligned with the rows of text_vec even
# if train_data does not carry a default 0..n-1 index)
category_dict = dict(enumerate(train_data['category']))

# creating hyperplanes
# max_features is only an upper bound on the vocabulary: when fewer n-grams
# survive min_df the matrix is narrower, so size the hyperplanes from the
# fitted matrix itself to keep the dot products below well-defined.
m = 5
n_features = text_vec.shape[1]
# One draw of shape (m, n_features); the array is filled row by row from the
# seeded global stream, in the same order as drawing the weights one at a time.
hyp_arr = np.random.normal(loc=0, scale=1, size=(m, n_features))
hyp = hyp_arr.tolist()  # list-of-lists form kept for any cell that reads `hyp`
#print("weights of hyperplanes:", hyp_arr)

# creating a dictionary(hash table) to store hash keys and hash values
# key   = which side of each of the m hyperplanes the document lies on
# value = row positions of all training documents sharing that sign pattern
hash_table = {}
for row_idx in range(text_vec.shape[0]):
    projections = np.dot(hyp_arr, text_vec[row_idx])
    hash_key = tuple(1 if p > 0 else -1 for p in projections.tolist())
    hash_table.setdefault(hash_key, []).append(row_idx)

#print("hash table:", hash_table)

### Predicting the category labels for the test dataset


In [61]:
def predictLabels(test_data, k=11):
    """Predict a category for every row of ``test_data`` and print the results.

    Each query text is vectorised with the already-fitted ``tfidf``, hashed
    against the random hyperplanes in ``hyp_arr``, and compared only with the
    training documents stored under the same key in ``hash_table``. The
    majority category among the ``k`` most similar candidates is the
    prediction; ties are resolved in favour of the alphabetically first
    category.

    Relies on the module-level names ``tfidf``, ``hyp_arr``, ``hash_table``,
    ``text_vec`` and ``category_dict`` built by the hash-table cell.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Must contain a 'text' column with the documents to classify.
    k : int, default 11
        Number of nearest neighbours that vote on the category.

    Returns
    -------
    None
        One "Text: ..., Prediction: ..." line is printed per row.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # loading and converting test data to make queries
    query_vec = tfidf.transform(test_data['text']).toarray()
    all_rows = range(text_vec.shape[0])

    # creating a list to store predictions of each query
    predictions = []
    for query in query_vec:
        projections = np.dot(hyp_arr, query)
        q_hash_key = tuple(1 if p > 0 else -1 for p in projections.tolist())

        # Direct bucket lookup. If no training document produced this sign
        # pattern, fall back to comparing against every training row instead
        # of failing or silently reusing the previous query's bucket.
        candidates = hash_table.get(q_hash_key) or all_rows

        # Dot product of each candidate with the query. With the vectorizer's
        # default L2 normalisation this equals the cosine similarity.
        similarity = {j: np.dot(text_vec[j], query) for j in candidates}

        # finding the k-Nearest Neighbours: ascending stable sort, then
        # reversed, so equal similarities keep the later-inserted row first
        ranked = sorted(similarity, key=similarity.get)
        knn = ranked[::-1][:k]

        # determining the majority category of the nearest neighbours;
        # max() returns the first maximum, and iterating the categories in
        # alphabetical order makes that the tie-break
        votes = Counter(category_dict[j] for j in knn)
        predictions.append(max(sorted(votes), key=votes.get))

    # For displaying the predictions of the corresponding text queries
    # (paired by position, so any DataFrame index works)
    for query_text, prediction in zip(test_data['text'], predictions):
        print(f"Text: {query_text}, Prediction: {prediction}")



### Output

In [60]:
# Classify every row of the test set; the function prints each text
# alongside its predicted category.
predictLabels(test_data)

Text: junk e-mails on relentless rise spam traffic is up by 40%  putting the total amount of e-mail that is junk up to an astonishing 90%.  the figures  from e-mail management firm email systems  will alarm firms attempting to cope with the amount of spam in their in-boxes. while virus traffic has slowed down  denial of service attacks are on the increase according to the firm. virus mail accounts for just over 15% of all e-mail traffic analysis by the firm has found.  it is no longer just multi-nationals that are in danger of so-called denial of service attacks  in which websites are bombarded by requests for information and rendered inaccessible. email systems refers to a small uk-based engineering firm  which received a staggering 12 million e-mails in january. the type of spam currently being sent has subtlety altered in the last few months  according to email systems analysis. half of spam received since christmas has been health-related with gambling and porn also on the increase