In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the read-only input tree and build the full path of every file in it.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
# Print one path per line, in the order the walk discovers them.
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [35]:
# Load the training CSV and the test CSV, in that order.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/learn-ai-bbc/BBC News Train.csv",
        "/kaggle/input/learn-ai-bbc/BBC News Test.csv",
    )
)

# Preview the first rows of the training frame as the cell output.
train_df.head()

In [33]:
# (rows, columns) of the training frame, then of the test frame, one per line.
print(train_df.shape, test_df.shape, sep='\n')

In [32]:
# Column dtypes and non-null counts; .info() prints its own report.
train_df.info()
# Distinct labels in the target column, shown as the cell output.
train_df["Category"].unique()

In [36]:
import matplotlib.pyplot as plt

# Category holds string labels, so count the articles per class and draw one
# bar per class instead of pushing the labels through a numeric histogram
# (which bins category positions and leaves unlabeled, unevenly spaced bars).
category_counts = train_df.Category.value_counts()

fig, ax = plt.subplots()
category_counts.plot.bar(ax=ax)
ax.set(
    title="Training articles per category",
    xlabel="Category",
    ylabel="Number of articles",
)
# Render the figure without echoing a matplotlib object repr as cell output.
plt.show()

1. The dataset has 3 columns: 'ArticleId', 'Text', and 'Category'.
2. There are 1,490 samples in the training set and 735 samples in the test set.
3. There are 5 classes in Category: ['business', 'tech', 'politics', 'sport', 'entertainment'].
4. Based on the plot above, we can assume the class distribution is fairly balanced.

I will use a very simple technique built from just two scikit-learn classes:  
NMF decomposition and TfidfVectorizer.  

There are many techniques for handling text data, but in this project  
understanding NMF is the most important goal, so I used the simplest approach.

In [37]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

# Build TF-IDF vectors from the article text, dropping English stop words and
# any term that appears in fewer than 50 training documents.
vect = TfidfVectorizer(stop_words='english', min_df=50)
X = vect.fit_transform(train_df["Text"])

# Factorise the TF-IDF matrix into 5 non-negative components (one per expected
# news category); the fixed random_state keeps the result reproducible.
model = NMF(random_state=5, n_components=5)
# fit() returns the fitted model, so the training matrix is then projected
# onto the learned components with the same transform() used for new data.
nmf_features = model.fit(X).transform(X)

In [38]:
# Shapes, one per line: the TF-IDF matrix, the per-article NMF features,
# and the learned NMF component matrix.
print(X.shape, nmf_features.shape, model.components_.shape, sep='\n')

In [39]:
# Label each NMF component column with the vocabulary term it belongs to.
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# get_feature_names_out() and fall back to the old name on older releases.
if hasattr(vect, "get_feature_names_out"):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()

# One row per topic, one column per term; values are the term weights.
components_df = pd.DataFrame(model.components_, columns=feature_names)
components_df

In [40]:
# Each row of components_df is one topic; show its ten heaviest terms.
# The frame has a default 0-based index, so the row label doubles as the topic index.
for topic_idx, word_weights in components_df.iterrows():
    top_words = word_weights.nlargest(10)
    print(f'For topic {topic_idx + 1} the words with the highest value are:')
    print(top_words)
    print('\n')

Based on the top words above, we can map each topic to a category as follows:

topic 1 : politics  
topic 2 : sport  
topic 3 : tech  
topic 4 : entertainment  
topic 5 : business  

In [41]:
# Lookup table from NMF topic index to the category assigned to it by
# inspecting the top words of each topic.
topic_categories = ['politics', 'sport', 'tech', 'entertainment', 'business']
pred_class = pd.DataFrame({
    'id': list(range(len(topic_categories))),
    'category': topic_categories,
})

# For every training article take the index of its strongest topic.
train_pred = pd.DataFrame(nmf_features).idxmax(axis=1).to_frame(name='id')
# Translate topic indices into category names (row order is kept by the left join).
train_pred.merge(pred_class, on='id', how='left')['category']

In [42]:
# Attach the predicted category to each training row.
train_labels = train_pred.merge(pred_class, on='id', how='left')
train_df['Pred'] = train_labels['category']
# Fraction of training articles whose predicted category matches the true one.
train_df['Pred'].eq(train_df['Category']).mean()

The accuracy on the training set is 87.2%.

In [43]:
# Transform the TF-IDF
text_X = vect.transform(test_df.Text)
# Transform the TF-IDF: nmf_features
test_nmf_features = model.transform(text_X)
pd.DataFrame(test_nmf_features).idxmax(axis=1)

In [44]:
# Strongest topic index per test article, stored in an 'id' column for the join.
pred = pd.DataFrame(test_nmf_features).idxmax(axis=1).to_frame(name='id')
# Translate topic indices into category names via the lookup table.
pred.merge(pred_class, on='id', how='left')['category']

In [45]:
# Store the predicted category name next to each test article.
test_labels = pred.merge(pred_class, on='id', how='left')
test_df['category'] = test_labels['category']

In [None]:
# Write every column except the raw article text to the submission file,
# leaving out the DataFrame index.
submission = test_df.drop(columns=['Text'])
submission.to_csv('submission.csv', index=False)

The accuracy on the test set is 87.7%.  
The model shows no sign of overfitting and the accuracy is quite good,
so I can accept this model.