In [1]:
import pandas as pd
from tqdm import tqdm
import pickle
import numpy as np
from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
data_path='./data/'
training_filename = data_path + 'training_data.txt'
print('Loading training dataset...')
train_data = pd.read_table(training_filename, delimiter='*')
train_data.drop(columns=['image_names'],inplace=True)
# train_data = train_data.values.tolist()
print(len(train_data))
x_train=train_data['tweets'].values
y_train=train_data['hashtags'].values

Loading training dataset...
51172


  after removing the cwd from sys.path.


In [3]:
validation_filename = data_path + 'validation_data.txt'
validation_data = pd.read_table(validation_filename, delimiter='*')
validation_data.drop(columns=['image_names'],inplace=True)
# validation_data = validation_data.values.tolist()
print(len(validation_data))
X_test=validation_data['tweets'].values
Y_test=validation_data['hashtags'].values

5686


  


In [4]:
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
print(x_train[0])
vectorizer.fit(x_train)
vectorizer.fit(X_test)

air feel smell fresh brew coffe


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [5]:
print(x_train.shape)
print(X_test.shape)
print(x_train[0])

(51172,)
(5686,)
air feel smell fresh brew coffe


In [6]:
x_train = vectorizer.transform(x_train)
x_test = vectorizer.transform(X_test)
print(x_train[0])

  (0, 94856)	0.4747594052546267
  (0, 94846)	0.38435796581677917
  (0, 39173)	0.32451634840457233
  (0, 34970)	0.23885810958576878
  (0, 18660)	0.3387421748758904
  (0, 12065)	0.4747594052546267
  (0, 1626)	0.3526551437613814


In [7]:
print(x_train.shape)
print(x_test.shape)

(51172, 118987)
(5686, 118987)


In [8]:
word_to_id_filename='word_to_id.p'
id_to_word_filename='id_to_word.p'
word_to_id = pickle.load(open(data_path +
                                           word_to_id_filename, 'rb'))
id_to_word = pickle.load(open(data_path +
                                           id_to_word_filename, 'rb'))

In [9]:
max_len=1003
def to_category_vector(texts,max_len):
    vector = np.zeros(max_len).astype(np.float32)
    for word in texts :
        vector[word_to_id[word]]=1.0
    return vector
document_Y = []
test_y=[]
for example in tqdm(y_train):
    arr=example.strip().split()
    document_Y.append(to_category_vector(arr,max_len))
document_Y=np.array(document_Y)
for example in tqdm(Y_test):
    arr=example.strip().split()
    test_y.append(to_category_vector(arr,max_len))
test_y=np.array(test_y)

100%|██████████| 51172/51172 [00:00<00:00, 131059.99it/s]
100%|██████████| 5686/5686 [00:00<00:00, 169777.48it/s]


In [10]:
print(document_Y[0])

[0. 0. 0. ... 0. 0. 0.]


In [11]:
%%time
# y_train = pd.DataFrame(document_Y.tolist())
classifier_new = MLkNN(k=10)
classifier_new.fit(x_train, document_Y)
predictions_new = classifier_new.predict(x_test)
print(predictions_new)

  (3, 13)	1
  (3, 26)	1
  (6, 261)	1
  (6, 334)	1
  (6, 494)	1
  (6, 954)	1
  (9, 20)	1
  (12, 106)	1
  (13, 772)	1
  (13, 965)	1
  (16, 4)	1
  (23, 3)	1
  (23, 16)	1
  (23, 18)	1
  (23, 83)	1
  (32, 18)	1
  (36, 16)	1
  (37, 5)	1
  (37, 16)	1
  (40, 513)	1
  (44, 47)	1
  (46, 3)	1
  (46, 5)	1
  (52, 5)	1
  (52, 47)	1
  (53, 347)	1
  (56, 37)	1
  (56, 38)	1
  (56, 513)	1
  (62, 3)	1
  (62, 5)	1
  (62, 8)	1
  (65, 4)	1
  (70, 414)	1
  (74, 25)	1
  (75, 4)	1
  (76, 18)	1
  (77, 271)	1
  (81, 10)	1
  (81, 61)	1
  (82, 5)	1
  (82, 6)	1
  (83, 169)	1
  (84, 5)	1
  (85, 825)	1
  (89, 5)	1
  (89, 25)	1
  (89, 37)	1
  (89, 39)	1
  (89, 43)	1
  (89, 69)	1
  (89, 78)	1
  (89, 92)	1
  (89, 108)	1
  (89, 114)	1
  (89, 140)	1
  (89, 141)	1
  (91, 165)	1
  (94, 38)	1
  (98, 10)	1
  (100, 4)	1
  (100, 5)	1
  (103, 12)	1
  (103, 13)	1
  (103, 25)	1
  (103, 35)	1
  (103, 36)	1
  (103, 37)	1
  (103, 39)	1
  (103, 43)	1
  (103, 69)	1
  (103, 78)	1
  (103, 103)	1
  (103, 108)	1
  (103, 140)	1
  (103, 148)

In [35]:
print(predictions_new.shape)
print(predictions_new[3])
print(predictions_new.getrow)

(5686, 1003)
  (0, 13)	1
  (0, 26)	1
<bound method lil_matrix.getrow of <5686x1003 sparse matrix of type '<class 'numpy.int64'>'
	with 5958 stored elements in LInked List format>>


In [21]:
pred=predictions_new.toarray()

In [29]:
print(pred.shape)
print(pred[3].tolist())

(5686, 1003)
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [42]:
g=np.where(pred[0]==1)
for item in g[0]:
    print(item)
    print(id_to_word[item])
print(g)

(array([], dtype=int64),)


In [46]:
with open("predict.txt",'w') as f:
    for i in tqdm(range(pred.shape[0])):
        g=np.where(pred[i]==1)
        hashtags=[]
        for item in g[0]:
            word=id_to_word[item]
            hashtags.append(word)
        f.write(' '.join(hashtags))
        f.write('\n')

100%|██████████| 5686/5686 [00:00<00:00, 143263.65it/s]
