In [1]:
%run -i "../util/util_simple_classifier.ipynb"

In [2]:
from sklearn.svm import SVC
from sentence_transformers import SentenceTransformer
from sklearn.metrics import confusion_matrix

In [3]:
train_df = pd.read_json("../data/bbc_train.json")
test_df = pd.read_json("../data/bbc_test.json")
train_df.sample(frac=1)
train_df.head()

Unnamed: 0,text,label,label_text,text_tokenized,text_clean
0,wales want rugby league training wales could f...,2,sport,"[wales, want, rugby, league, training, wales, ...",wales want rugby league training wales could f...
1,china aviation seeks rescue deal scandal-hit j...,1,business,"[china, aviation, seeks, rescue, deal, scandal...",china aviation seeks rescue deal scandal-hit j...
2,rock band u2 break ticket record u2 have smash...,3,entertainment,"[rock, band, u2, break, ticket, record, u2, sm...",rock band u2 break ticket record u2 smashed ir...
3,markets signal brazilian recovery the brazilia...,1,business,"[markets, signal, brazilian, recovery, brazili...",markets signal brazilian recovery brazilian st...
4,tough rules for ringtone sellers firms that fl...,0,tech,"[tough, rules, ringtone, sellers, firms, flout...",tough rules ringtone sellers firms flout rules...


In [6]:
print(train_df.groupby('label').count())
print(test_df.groupby('label').count())

       text  label_text  text_tokenized  text_clean
label                                              
0       321         321             321         321
1       408         408             408         408
2       409         409             409         409
3       309         309             309         309
4       333         333             333         333
       text  label_text  text_tokenized  text_clean
label                                              
0        80          80              80          80
1       102         102             102         102
2       102         102             102         102
3        77          77              77          77
4        84          84              84          84


In [7]:
model = SentenceTransformer('all-MiniLM-L6-v2')
def get_sentence_vector(text, model):
    sentence_embeddings = model.encode([text])
    return sentence_embeddings[0]

In [8]:
def train_classifier(X_train, y_train):
    classifier = SVC(C=0.1, kernel='rbf')
    classifier.fit(X_train, y_train)
    return classifier

In [12]:
target_names = ["tech", "business", "sport", "entertainment", "politics"]
vectorize = lambda text: get_sentence_vector(text, model)
(X_train, X_test, y_train, y_test) = create_train_test_data(
    train_df, test_df, vectorize, column_name="text_clean"
)
clf = train_classifier(X_train, y_train)


In [11]:
print(classification_report(train_df["label"], y_train, target_names=target_names))
print(classification_report(test_df["label"], clf.predict(X_test), target_names=target_names))

               precision    recall  f1-score   support

         tech       1.00      1.00      1.00       321
     business       1.00      1.00      1.00       408
        sport       1.00      1.00      1.00       409
entertainment       1.00      1.00      1.00       309
     politics       1.00      1.00      1.00       333

     accuracy                           1.00      1780
    macro avg       1.00      1.00      1.00      1780
 weighted avg       1.00      1.00      1.00      1780

               precision    recall  f1-score   support

         tech       0.97      0.95      0.96        80
     business       0.98      0.97      0.98       102
        sport       0.98      1.00      0.99       102
entertainment       0.96      0.99      0.97        77
     politics       0.98      0.96      0.97        84

     accuracy                           0.98       445
    macro avg       0.97      0.97      0.97       445
 weighted avg       0.98      0.98      0.98       445



In [13]:
print(confusion_matrix(test_df["label"], clf.predict(X_test)))

[[ 76   0   1   2   1]
 [  1  99   1   1   0]
 [  0   0 102   0   0]
 [  0   0   0  76   1]
 [  1   2   0   0  81]]


In [14]:
new_example = """iPhone 12: Apple makes jump to 5G
Apple has confirmed its iPhone 12 handsets will be its first to work on faster 5G networks. 
The company has also extended the range to include a new "Mini" model that has a smaller 5.4in screen. 
The US firm bucked a wider industry downturn by increasing its handset sales over the past year. 
But some experts say the new features give Apple its best opportunity for growth since 2014, when it revamped its line-up with the iPhone 6. 
"5G will bring a new level of performance for downloads and uploads, higher quality video streaming, more responsive gaming, 
real-time interactivity and so much more," said chief executive Tim Cook. 
There has also been a cosmetic refresh this time round, with the sides of the devices getting sharper, flatter edges. 
The higher-end iPhone 12 Pro models also get bigger screens than before and a new sensor to help with low-light photography. 
However, for the first time none of the devices will be bundled with headphones or a charger. 
Apple said the move was to help reduce its impact on the environment. "Tim Cook [has] the stage set for a super-cycle 5G product release," 
commented Dan Ives, an analyst at Wedbush Securities. 
He added that about 40% of the 950 million iPhones in use had not been upgraded in at least three-and-a-half years, presenting a "once-in-a-decade" opportunity. 
In theory, the Mini could dent Apple's earnings by encouraging the public to buy a product on which it makes a smaller profit than the other phones. 
But one expert thought that unlikely. 
"Apple successfully launched the iPhone SE in April by introducing it at a lower price point without cannibalising sales of the iPhone 11 series," noted Marta Pinto from IDC. 
"There are customers out there who want a smaller, cheaper phone, so this is a proven formula that takes into account market trends." 
The iPhone is already the bestselling smartphone brand in the UK and the second-most popular in the world in terms of market share. 
If forecasts of pent up demand are correct, it could prompt a battle between network operators, as customers become more likely to switch. 
"Networks are going to have to offer eye-wateringly attractive deals, and the way they're going to do that is on great tariffs and attractive trade-in deals," 
predicted Ben Wood from the consultancy CCS Insight. Apple typically unveils its new iPhones in September, but opted for a later date this year. 
It has not said why, but it was widely speculated to be related to disruption caused by the coronavirus pandemic. The firm's shares ended the day 2.7% lower. 
This has been linked to reports that several Chinese internet platforms opted not to carry the livestream, 
although it was still widely viewed and commented on via the social media network Sina Weibo."""
vector = vectorize(new_example)
prediction = clf.predict([vector])
print(prediction)

[0]
