## EAST AFRICA VIRTUAL HACKATHON 2021: SWAHILI NEWS ML CHALLENGE

## Let's Get Started 

In [16]:
# import important modules
import numpy as np
import pandas as pd

# sklearn modules
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB # classifier 

from sklearn.metrics import log_loss #evaluation metric
from sklearn.feature_extraction.text import CountVectorizer

# text preprocessing modules
import re 
from string import punctuation 

import warnings
warnings.filterwarnings("ignore")
# seeding
np.random.seed(123)

In [17]:
# load data
path = ''
train = pd.read_csv(path+"train.csv")
test = pd.read_csv(path+"test.csv")
submission = pd.read_csv(path+"sample_submission.csv")

In [18]:
# show top five rows of train data
train.head() 

Unnamed: 0,id,content,category
0,SW4670,"Bodi ya Utalii Tanzania (TTB) imesema, itafan...",uchumi
1,SW30826,"PENDO FUNDISHA-MBEYA RAIS Dk. John Magufuri, ...",kitaifa
2,SW29725,Mwandishi Wetu -Singida BENKI ya NMB imetoa ms...,uchumi
3,SW20901,"TIMU ya taifa ya Tanzania, Serengeti Boys jan...",michezo
4,SW12560,Na AGATHA CHARLES – DAR ES SALAAM ALIYEKUWA K...,kitaifa


In [19]:
# show top five rows of test data
test.head()

Unnamed: 0,id,content
0,SW4255,WAZIRI MKUU Kassim Majaliwa amep okea leseni ...
1,SW15677,RAIS John Magufuli amewataka viongozi wa Halm...
2,SW15925,"NEW YORK, MAREKANI MKALI wa hip hop nchini Mar..."
3,SW7615,"WAZIRI wa Kilimo, Dk Charles Tizeba amelitaka..."
4,SW28011,"Mwandishi wetu, Tanga WAFANYABIASHARA wa Mkoa ..."


In [20]:
# show top five rows of submision file
submission.head()

Unnamed: 0,test_id,kitaifa,michezo,burudani,uchumi,kimataifa,afya
0,SW4255,1.0,0.0,0.0,0.0,0.0,0.0
1,SW15677,0.0,0.0,0.0,1.0,0.0,0.0
2,SW15925,,,,,,
3,SW7615,,,,,,
4,SW28011,,,,,,


In [21]:
# check the shape of the train data
train.shape

(23268, 3)

In [22]:
# check the shape of the test data
test.shape

(7756, 2)

In [23]:
# check missing values in train data
train.isnull().sum()

id          0
content     0
category    0
dtype: int64

In [24]:
# check missing values in test data
test.isnull().sum()

id         0
content    0
dtype: int64

In [25]:
# evalute news category distribution
train.category.value_counts()

kitaifa      10242
michezo       6004
burudani      2229
uchumi        2028
kimataifa     1906
afya           859
Name: category, dtype: int64

### Data Preparation 

In [27]:
# a mapping dictionary that maps the category values from 0 to 5
category_mapping = {
"kitaifa": 0,
"michezo": 1,
"burudani": 2,
"uchumi": 3,
"kimataifa": 4,
"afya": 5
}

train["category"] = train.category.map(category_mapping)

train.head()

Unnamed: 0,id,content,category
0,SW4670,"Bodi ya Utalii Tanzania (TTB) imesema, itafan...",
1,SW30826,"PENDO FUNDISHA-MBEYA RAIS Dk. John Magufuri, ...",
2,SW29725,Mwandishi Wetu -Singida BENKI ya NMB imetoa ms...,
3,SW20901,"TIMU ya taifa ya Tanzania, Serengeti Boys jan...",
4,SW12560,Na AGATHA CHARLES – DAR ES SALAAM ALIYEKUWA K...,


In [30]:
# a simple function to clean text data 

def text_cleaning(text):
    # Clean the text data

    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r'\b\d+(?:\.\d+)?\s+', '', text) # remove numbers
    text = text.lower()  # set in lowercase 
        
    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])
        
    # Return a list of words
    return(text)

In [31]:
#clean the train and test data
train["content"] = train["content"].apply(text_cleaning)
test["content"] = test["content"].apply(text_cleaning)

In [None]:
#split features and target from train data 
X = train["content"]
y = train.category.values

In [32]:
# Transform text data 
vectorizer = CountVectorizer(lowercase=False)

vectorizer.fit(X)

#transform train data 
X_transformed = vectorizer.transform(X)

#transform test data
test_transformed = vectorizer.transform(test["content"])

In [33]:
# split data into train and validate

X_train, X_valid, y_train, y_valid = train_test_split(
    X_transformed,
    y,
    test_size=0.20,
    random_state=42,
    shuffle=True,
    stratify=y,
)

### Create Classifier 
We will use a simple averaging technique where we use two models and average the output of the models

# Model_#1:  GBoost Classifier

In [41]:
from sklearn.ensemble import GradientBoostingClassifier
GBoost = GradientBoostingClassifier(n_estimators=1597, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, random_state =5)
#Fit the model on the whole dataset
GBoost.fit(X_transformed, y)

# create prediction for the GBoost model
test_probas0 = GBoost.predict_proba(test_transformed)

# Model_#2: XGBoost Classifier

In [45]:
from xgboost import  XGBClassifier
XGB = XGBClassifier(objective ='binary:logistic')

#Fit the model on the whole dataset
XGB.fit(X_transformed, y)

# create prediction for the XGBoost model
test_probas1 = XGB.predict_proba(test_transformed)

## Simple averaging

We now will use a simple averaging technique to average the outputs of these two models

In [51]:
test_probas = test_probas0*0.5 + test_probas1*0.5

### Create Submission File
We now can create a submission file 

In [52]:

# create submission file 
submission_cols = ['kitaifa', 'michezo', 'burudani','uchumi', 'kimataifa', 'afya'] 
submission_df = pd.DataFrame(test_probas, columns = submission_cols)
submission_df['test_id'] = submission['test_id']   # add  test_id 

#rearange columns 
submission_df = submission_df[['test_id','kitaifa', 'michezo', 'burudani','uchumi', 'kimataifa', 'afya']]

# save submission file 
submission_df.to_csv(path+"fine_submission.csv",index=False) 

Now upload your first submssion file on the hackathon page 👍