# Labelling dataset

## Load libraries and dataset

In [50]:
import pandas as pd
import numpy as np

import tensorflow_hub as hub
import tensorflow as tf

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [52]:
# Read the source CSV from the mounted Drive folder and preview the first rows.
csv_path = '/content/drive/MyDrive/Assignments/NLP - Social Media/Unlabelled_text_V2.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,link,text,label,site
0,https://www.skysports.com/mma/news/36350/12389...,Manchester MMA fighter Brendan Loughnane 21-3-...,mma,SKY
1,https://www.skysports.com/mma/news/36350/12385...,Former PFL women s lightweight champion Kayla ...,mma,SKY
2,https://www.skysports.com/mma/news/19828/12384...,Kayla Harrison is a perfectionist inside the c...,mma,SKY
3,https://www.skysports.com/mma/news/19828/12376...,Ciryl Gane stopped Derrick Lewis with a flurry...,mma,SKY
4,https://www.skysports.com/mma/news/36350/12373...,As Manchester's featherweight contender Brenda...,mma,SKY


In [53]:
# Row/column count of the frame as loaded, before any columns are dropped.
data.shape

(21594, 4)

In [54]:
# Remove the 'link' and 'site' columns; only the remaining columns are used below.
# Reassigning (instead of inplace=True) and ignoring already-missing columns
# keeps this cell re-runnable: a second execution no longer raises KeyError.
data = data.drop(columns=['link', 'site'], errors='ignore')

In [55]:
# Check the shape and preview the first rows of the frame after the column drop.
print(data.shape)
data.head()

(21594, 2)


Unnamed: 0,text,label
0,Manchester MMA fighter Brendan Loughnane 21-3-...,mma
1,Former PFL women s lightweight champion Kayla ...,mma
2,Kayla Harrison is a perfectionist inside the c...,mma
3,Ciryl Gane stopped Derrick Lewis with a flurry...,mma
4,As Manchester's featherweight contender Brenda...,mma


In [56]:
# Row count per label. groupby excludes NaN keys by default, so rows without
# a label are not represented in this table.
data.groupby('label').size()

label
baseball        272
basketball      158
boxing          135
cricket         241
f1              130
football        247
golf            101
hockey           74
mma              31
netball         109
nonsport       1229
other_sport     621
rugby           205
soccer          443
tennis          155
dtype: int64

## Universal Sentence Encoder (USE) embeddings

In [57]:
# Load the Universal Sentence Encoder (version 4) module from TF Hub.
use_module_url = 'https://tfhub.dev/google/universal-sentence-encoder/4'
embed = hub.load(use_module_url)

In [58]:
# Encode the text column in six consecutive positional slices of up to 4000
# rows each (the last slice takes everything from row 20000 onwards).
# NOTE(review): chunking is presumably to bound memory per call -- confirm.
texts = data.text.astype(str)
chunk_bounds = [
    (0, 4000),
    (4000, 8000),
    (8000, 12000),
    (12000, 16000),
    (16000, 20000),
    (20000, None),
]
(
    embeddings1,
    embeddings2,
    embeddings3,
    embeddings4,
    embeddings5,
    embeddings6,
) = (embed(texts[start:stop]) for start, stop in chunk_bounds)

In [59]:
# Stack the per-slice embeddings row-wise, in slice order, into one matrix.
embedding_chunks = [
    embeddings1,
    embeddings2,
    embeddings3,
    embeddings4,
    embeddings5,
    embeddings6,
]
X = np.concatenate(embedding_chunks, axis=0)

In [60]:
# Shape of the stacked embedding matrix; the row count should equal len(data).
X.shape

(21594, 512)

## Train test split (labelled rows vs. unlabelled rows)

In [61]:
# Training set = rows that already have a label.
# The mask is built once with notna() instead of repeating `isna() == False`
# for every selection; the selected rows are the same.
has_label = data.label.notna()
data_train = data.loc[has_label, :]
X_train = X[has_label.to_numpy()]  # X is row-aligned with data (built from data.text in order)
y_train = data.label[has_label]
print(data_train.shape, X_train.shape, y_train.shape)

(4151, 2) (4151, 512) (4151,)


In [62]:
# "Test" set = rows with no label; these are the rows to be labelled by the model.
missing_label = data.label.isna()
data_test = data.loc[missing_label, :]
X_test = X[missing_label.to_numpy()]
# By construction every value here is NaN; y_test is reassigned in the Predict section.
y_test = data.label[missing_label]
print(data_test.shape, X_test.shape, y_test.shape)

(17443, 2) (17443, 512) (17443,)


## KNN

In [63]:
# Base estimator; GridSearchCV fits clones of it, one per candidate and fold.
knn = KNeighborsClassifier()

# Candidate neighbourhood sizes: every integer k from 3 to 29 inclusive.
parameters = dict(n_neighbors=np.arange(3, 30))

# Candidates are ranked by plain classification accuracy.
acc_scorer = metrics.make_scorer(metrics.accuracy_score)

# 5-fold cross-validated search over k; fit() returns the fitted search object.
grid_obj = GridSearchCV(
    knn,
    parameters,
    scoring=acc_scorer,
    cv=5,
).fit(X_train, y_train)

In [64]:
# Best number of neighbours: the parameter setting with the highest mean
# cross-validation score in the search above.
grid_obj.best_params_

{'n_neighbors': 8}

In [66]:
# Tabulate the cross-validation results of the search (one row per candidate k).
results = pd.DataFrame(grid_obj.cv_results_ )

In [72]:
# Report the mean CV accuracy of the candidate the search actually selected.
# best_score_ is the mean_test_score of the best_params_ row, so this no longer
# relies on a hard-coded k=8 lookup that goes stale if the search picks another k.
print('Mean cross-validation accuracy for knn: ', grid_obj.best_score_)

Mean cross-validation accuracy for knn:  0.8778556826584316


In [73]:
# Build the final classifier from the parameters chosen by the grid search
# (instead of a hard-coded n_neighbors=8) and fit it on all labelled rows.
knn = KNeighborsClassifier(**grid_obj.best_params_)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=8, p=2,
                     weights='uniform')

In [74]:
# Accuracy on the same rows the model was fitted on; this is not a held-out
# estimate (the cross-validation score above is the out-of-sample figure).
knn.score(X_train,y_train)

0.9339918092026018

## Predict

In [75]:
# Predict a label for every unlabelled row. This rebinds y_test, which until
# now held the all-NaN label column of those rows, to the array of predictions.
y_test = knn.predict(X_test)

In [76]:
# Peek at the first ten predicted labels.
y_test[:10]

array(['baseball', 'soccer', 'football', 'baseball', 'football',
       'football', 'soccer', 'mma', 'baseball', 'football'], dtype=object)

In [77]:
# Distribution of the predicted labels over the previously unlabelled rows.
pd.Series(y_test).value_counts()

other_sport    7065
football       1978
baseball       1693
soccer         1649
nonsport       1107
cricket         803
basketball      770
rugby           661
boxing          428
tennis          336
f1              328
netball         210
golf            175
hockey          155
mma              85
dtype: int64

## Save dataset

In [78]:
# Stack the texts with the labelled rows first and the unlabelled rows after
# them; the labels are concatenated in this same order so the two stay aligned.
text_parts = [data_train.text, data_test.text]
text = np.concatenate(text_parts)
print(text[:2])
text.shape

['Manchester MMA fighter Brendan Loughnane 21-3-0 takes on Movlid Khaybulaev with dreams of becoming featherweight PFL champion and taking away 1m 730,000 in prize money. Fighters across the featherweight and light heavyweight divisions will be aiming to make it out of the final four and move one step closer to becoming a PFL champion and capturing a life-changing amount of money. Loughnane and Russian Khaybulaev will clash in a featherweight contest on Friday, with each fighter looking for their first PFL Championship berth. Please use Chrome browser for a more accessible video player Brendan Loughnane clinched a spot in the PFL playoffs with a hard fought majority decision over Tyler Diamond "As mad as it sounds I\'m not even looking at the money," Loughnane told Sky Sports\' Ed Draper. "To be a world champion is my goal. "Money comes and goes in your life and it always will do, but it\'s all about that piece of gold which I\'ve worked 13 years to get anywhere near this. I was 70-1 t

(21594,)

In [79]:
# Original labels first, then the predicted ones, mirroring the order of `text`.
label_parts = [y_train, y_test]
lab = np.concatenate(label_parts)
print(lab[:10])
lab.shape

['mma' 'mma' 'mma' 'mma' 'mma' 'mma' 'mma' 'mma' 'mma' 'mma']


(21594,)

In [80]:
# Assemble the fully labelled dataset: one text column and one label column.
final = pd.DataFrame({'text': text, 'label': lab})
final.head()

Unnamed: 0,text,label
0,Manchester MMA fighter Brendan Loughnane 21-3-...,mma
1,Former PFL women s lightweight champion Kayla ...,mma
2,Kayla Harrison is a perfectionist inside the c...,mma
3,Ciryl Gane stopped Derrick Lewis with a flurry...,mma
4,As Manchester's featherweight contender Brenda...,mma


In [81]:
# Row/column count of the combined frame.
final.shape

(21594, 2)

In [82]:
# Per-label row counts over the combined frame (original plus predicted labels).
final.groupby('label').size()

label
baseball       1965
basketball      928
boxing          563
cricket        1044
f1              458
football       2225
golf            276
hockey          229
mma             116
netball         319
nonsport       2336
other_sport    7686
rugby           866
soccer         2092
tennis          491
dtype: int64

In [85]:
# Write the combined dataset to Drive as CSV, omitting the index column.
output_path = '/content/drive/MyDrive/Labelled_text.csv'
final.to_csv(output_path, index=False)