In [1]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [3]:
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.multioutput import MultiOutputClassifier

In [4]:
df = pd.read_csv('data_final.csv')

In [5]:
df

Unnamed: 0,id,category,sub-category,title,abstract,clean_title
0,N55528,lifestyle,lifestyleroyals,the brands queen elizabeth prince charles and ...,shop the notebooks jackets and more that the r...,brand queen elizabeth prince charles prince p...
1,N19639,health,weightloss,50 worst habits for belly fat,these seemingly harmless habits are holding yo...,50 bad habit belly fat
2,N61837,news,newsworld,the cost of trump s aid freeze in the trenches...,lt ivan molchanets peeked over a parapet of sa...,cost trump aid freeze trench ukraine war
3,N53526,health,voices,i was an nba wife here s how it affected my me...,i felt like i was a fraud and being an nba wif...,nba wife affect mental health
4,N38324,health,medical,how to get rid of skin tags according to a der...,they seem harmless but there s a very good rea...,get rid skin tag accord dermatologist
...,...,...,...,...,...,...
48607,N17258,news,newsscienceandtechnology,realme takes chunk of india mobile market as s...,over 400 percent more phones shipped year on year,realme take chunk india mobile market samsung...
48608,N23858,sports,golf,young northeast florida fans flock to u s wome...,when the u s women s national soccer team arri...,young northeast florida fan flock u woman soc...
48609,N16909,weather,weathertopstories,adapting learning and soul searching reflectin...,woolsey fire anniversary a community is foreve...,adapt learn soul search reflect woolsey fire
48610,N7482,sports,more_sports,st dominic soccer player tries to kick cancer ...,sometimes what happens on the sidelines can be...,st dominic soccer player try kick cancer curb


In [6]:
encode_cat = LabelEncoder().fit(df['category'])
encode_sub_cat = LabelEncoder().fit(df['sub-category'])

In [7]:
df['category'] = encode_cat.transform(df['category'])
df['sub-category'] = encode_sub_cat.transform(df['sub-category'])

In [8]:
df

Unnamed: 0,id,category,sub-category,title,abstract,clean_title
0,N55528,6,138,the brands queen elizabeth prince charles and ...,shop the notebooks jackets and more that the r...,brand queen elizabeth prince charles prince p...
1,N19639,4,254,50 worst habits for belly fat,these seemingly harmless habits are holding yo...,50 bad habit belly fat
2,N61837,10,186,the cost of trump s aid freeze in the trenches...,lt ivan molchanets peeked over a parapet of sa...,cost trump aid freeze trench ukraine war
3,N53526,4,249,i was an nba wife here s how it affected my me...,i felt like i was a fraud and being an nba wif...,nba wife affect mental health
4,N38324,4,147,how to get rid of skin tags according to a der...,they seem harmless but there s a very good rea...,get rid skin tag accord dermatologist
...,...,...,...,...,...,...
48607,N17258,10,180,realme takes chunk of india mobile market as s...,over 400 percent more phones shipped year on year,realme take chunk india mobile market samsung...
48608,N23858,12,101,young northeast florida fans flock to u s wome...,when the u s women s national soccer team arri...,young northeast florida fan flock u woman soc...
48609,N16909,16,252,adapting learning and soul searching reflectin...,woolsey fire anniversary a community is foreve...,adapt learn soul search reflect woolsey fire
48610,N7482,12,151,st dominic soccer player tries to kick cancer ...,sometimes what happens on the sidelines can be...,st dominic soccer player try kick cancer curb


# MULTI CLASS - CATEGORY AND SUB-CATEGORY CLASSIFIER

In [9]:
X = df['clean_title']
y = df[['category','sub-category']]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [11]:
print('X shape: {}'.format(X.shape))
print('y shape: {}'.format(y.shape))
print('X train shape: {}'.format(X_train.shape))
print('y train shape: {}'.format(y_train.shape))
print('X test shape: {}'.format(X_test.shape))
print('y test shape: {}'.format(y_test.shape))

X shape: (48612,)
y shape: (48612, 2)
X train shape: (38889,)
y train shape: (38889, 2)
X test shape: (9723,)
y test shape: (9723, 2)


In [12]:
vector_train = TfidfVectorizer(ngram_range=(1,2)).fit(X_train)
vocab = vector_train.vocabulary_
features = vector_train.get_feature_names()
X_train_vect = vector_train.transform(X_train)
vector_test = TfidfVectorizer(vocabulary=vocab).fit(X_test)
X_test_vect = vector_test.transform(X_test)

In [18]:
xgb = XGBClassifier()
xgb_1 = MultiOutputClassifier(xgb)
xgb_multi = xgb_1.fit(X_train_vect,y_train)
print('Train set accuracy {:.4f}'.format(xgb_multi.score(X_train_vect,y_train)))
print('Train set accuracy {:.4f}'.format(xgb_multi.score(X_test_vect,y_test)))

Train set accuracy 0.6208
Train set accuracy 0.4091


In [13]:
etc = ExtraTreesClassifier()
etc_1 = MultiOutputClassifier(etc)
etc_multi = etc_1.fit(X_train_vect,y_train)
print('Train set accuracy {:.4f}'.format(etc_multi.score(X_train_vect,y_train)))
print('Train set accuracy {:.4f}'.format(etc_multi.score(X_test_vect,y_test)))

Train set accuracy 0.9965
Train set accuracy 0.4793


In [19]:
lr = LogisticRegression()
lr_1 = MultiOutputClassifier(lr)
lr_multi = lr_1.fit(X_train_vect,y_train)
print('Train set accuracy {:.4f}'.format(lr_multi.score(X_train_vect,y_train)))
print('Train set accuracy {:.4f}'.format(lr_multi.score(X_test_vect,y_test)))

Train set accuracy 0.5975
Train set accuracy 0.4853


In [26]:
est = [('etc',etc),('xgb',xgb),('lr',lr)]
stacked = StackingClassifier(n_jobs=-1,estimators=est,final_estimator=lr,passthrough=True)

In [None]:
stacked_1 = MultiOutputClassifier(stacked)
stacked_multi = stacked_1.fit(X_train_vect,y_train)
print('Train set accuracy {:.4f}'.format(stacked_multi.score(X_train_vect,y_train)))
print('Train set accuracy {:.4f}'.format(stacked_multi.score(X_test_vect,y_test)))

# CATEGORY CLASSIFIER

In [28]:
X = df['clean_title']
y = df['category']

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [30]:
vector_train = TfidfVectorizer(ngram_range=(1,2)).fit(X_train)
vocab = vector_train.vocabulary_
features = vector_train.get_feature_names()
X_train_vect = vector_train.transform(X_train)
vector_test = TfidfVectorizer(vocabulary=vocab).fit(X_test)
X_test_vect = vector_test.transform(X_test)

In [31]:
xgb = XGBClassifier()
xgb_1 = xgb.fit(X_train_vect,y_train)
print('Train set accuracy {:.4f}'.format(xgb_1.score(X_train_vect,y_train)))
print('Train set accuracy {:.4f}'.format(xgb_1.score(X_test_vect,y_test)))

Train set accuracy 0.7616
Train set accuracy 0.6157


In [32]:
etc = ExtraTreesClassifier()
etc_1 = etc.fit(X_train_vect,y_train)
print('Train set accuracy {:.4f}'.format(etc_1.score(X_train_vect,y_train)))
print('Train set accuracy {:.4f}'.format(etc_1.score(X_test_vect,y_test)))

Train set accuracy 0.9983
Train set accuracy 0.6933


In [33]:
lgbm = LGBMClassifier()
lgbm_1 = lgbm.fit(X_train_vect,y_train)
print('Train set accuracy {:.4f}'.format(lgbm_1.score(X_train_vect,y_train)))
print('Train set accuracy {:.4f}'.format(lgbm_1.score(X_test_vect,y_test)))

Train set accuracy 0.7089
Train set accuracy 0.6024


In [34]:
lr = LogisticRegression()
lr_1 = lr.fit(X_train_vect,y_train)
print('Train set accuracy {:.4f}'.format(lr_1.score(X_train_vect,y_train)))
print('Train set accuracy {:.4f}'.format(lr_1.score(X_test_vect,y_test)))

Train set accuracy 0.8086
Train set accuracy 0.6983


In [37]:
est = [('etc',etc),('xgb',xgb),('lgbm',lgbm),('lr',lr)]
stacked = StackingClassifier(n_jobs=-1,estimators=est,final_estimator=lr,passthrough=True)

In [None]:
stacked_1 = stacked.fit(X_train_vect,y_train)
print('Train set accuracy {:.4f}'.format(stacked_1.score(X_train_vect,y_train)))
print('Train set accuracy {:.4f}'.format(stacked_1.score(X_test_vect,y_test)))

In [None]:
# Train set accuracy 0.9961
# Train set accuracy 0.7208