In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# Show full cell contents (URLs are long). `None` is the supported "no limit" value;
# the old `-1` sentinel was deprecated in pandas 1.0 and is rejected by current pandas.
pd.set_option('display.max_colwidth', None)
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Read the training rows (which carry the Tag column) and the rows to be predicted.
train_data = pd.read_csv('./train/train.csv')
test_data = pd.read_csv('test_nvPHrOx.csv')

In [4]:
# Preview the first rows to check which columns were loaded.
train_data.head()

Unnamed: 0,Webpage_id,Domain,Url,Tag
0,1,www.fiercepharma.com,http://www.fiercepharma.com/marketing/tecfidera-gilenya-and-aubagio-s-3-way-battle-for-ms-share-about-to-get-more-interesting,news
1,2,www.fiercepharma.com,http://www.fiercepharma.com/pharma/novo-equipped-to-weather-storm-u-s-diabetes-market-ceo-says,news
2,3,www.fiercepharma.com,http://www.fiercepharma.com/pharma/another-exec-departs-troubled-endo-and-time-it-s-for-another-drugmaker,news
3,4,www.fiercepharma.com,http://www.fiercepharma.com/pharma/teva-buy-biosim-specialist-celltrion-it-wouldn-t-say-no,news
4,5,www.fiercepharma.com,http://www.fiercepharma.com/marketing/actress-marissa-tomei-partners-allergan-restasis-to-drive-dry-eye-awareness,news


In [5]:
# Quick probe: the 4th '/'-separated piece of a URL is its first path segment.
sample_url = train_data.loc[0, 'Url']
url_pieces = sample_url.split('/')
url_pieces[3]

'marketing'

In [6]:
def get_features_from_url(row):
    """Turn a row's URL into a single space-separated string of its pieces.

    Every '/', '.', ':' and '-' in ``row['Url']`` is replaced by a space and
    runs of whitespace are collapsed, so the result can be fed to a
    word-level text vectorizer.

    Parameters
    ----------
    row : mapping-like
        Anything supporting ``row['Url']`` that yields a string
        (e.g. a DataFrame row passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        The URL pieces joined by single spaces.
    """
    spaced = re.sub(r'[\/\.\:\-]', ' ', row['Url'])
    # split() with no argument drops empty pieces left by adjacent separators.
    pieces = spaced.split()
    return ' '.join(pieces)

In [7]:
# Derived text columns for the training frame:
#   domain_name  - second dot-separated label of Domain (e.g. 'www.x.com' -> 'x')
#   words_in_url - URL broken into space-separated pieces
train_data['domain_name'] = train_data['Domain'].map(lambda domain: domain.split('.')[1])
train_data['words_in_url'] = train_data.apply(get_features_from_url, axis=1)

In [8]:
# Same two derived columns as for the training frame, computed on the test frame.
test_data['domain_name'] = test_data['Domain'].map(lambda domain: domain.split('.')[1])
test_data['words_in_url'] = test_data.apply(get_features_from_url, axis=1)

In [9]:
# Dump the raw domain beside its derived name (plus URL and label) to CSV, without the index.
domain_name = train_data.loc[:, ['Domain', 'domain_name', 'Url', 'Tag']]
domain_name.to_csv('domain_name.csv', sep=',', index=False)

In [22]:
# Word-level TF-IDF over unigrams and bigrams, vocabulary limited to 1200 features.
# NOTE: despite its name, this vectorizer is fitted on the `words_in_url` column below.
tfidf_vectorizer_domain_name = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    max_features=1200,
    norm='l2',
)

In [19]:
# Hold out 25% of the rows for validation, stratified on Tag, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    train_data['words_in_url'],
    train_data['Tag'],
    test_size=0.25,
    stratify=train_data['Tag'],
    random_state=42,
)

In [20]:
# Fit the vectorizer on the training split only, then apply the fitted vectorizer
# to the hold-out split. Both results are converted to dense arrays.
# NOTE: this cell overwrites X_train / X_test (text -> arrays), so it is not
# re-runnable without first re-running the split cell.
X_train = tfidf_vectorizer_domain_name.fit_transform(X_train).toarray()
X_test = tfidf_vectorizer_domain_name.transform(X_test).toarray()

In [17]:
# SGD-trained linear classifier for the hold-out experiment.
# `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21. `max_iter=20`
# together with `tol=None` keeps the original behaviour of exactly 20 passes over
# the data (no tolerance-based early stopping).
clf_SGD = SGDClassifier(max_iter=20, tol=None, random_state=2)

In [20]:
# Train on the training split; the cell's value is the score on the hold-out split.
# (fit() returns the estimator itself, so the calls can be chained.)
clf_SGD.fit(X_train, y_train).score(X_test, y_test)

0.89110911540188598

In [21]:
# Logistic-regression baseline on the same split; the cell's value is the hold-out score.
clf_lr = LogisticRegression(random_state=0).fit(X_train, y_train)
clf_lr.score(X_test, y_test)

0.89410267923963482

In [60]:
# Random forest (library defaults, fixed seed) on the same split; value is the hold-out score.
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
rf.score(X_test, y_test)

0.93526418200868133

In [35]:
# Vectorizer for the final models: like the validation one but with n-grams up to length 3.
vectorizer_train_data = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    max_features=1200,
    norm='l2',
)


In [36]:
# Fit the vectorizer on every training URL (no hold-out) and keep the full label column.
X_final = vectorizer_train_data.fit_transform(train_data['words_in_url']).toarray()
y = train_data['Tag']

In [37]:
# Transform the test URLs with the vectorizer fitted on the training URLs.
X_test_final = vectorizer_train_data.transform(test_data['words_in_url']).toarray()

In [64]:
# Refit the random forest on all training rows and write its test predictions to CSV.
# NOTE(review): this cell's execution count (In [64]) is later than the cell that
# rebinds X_final to domain-name features (In [43]); on a top-to-bottom run X_final
# here holds the words_in_url features instead. Confirm which was intended.
rf_final = RandomForestClassifier(random_state=0).fit(X_final, y)

rf_predictions = rf_final.predict(X_test_final)
predictions = pd.DataFrame({
    'Webpage_id': test_data['Webpage_id'],
    'Tag': rf_predictions,
})
# `columns=` pins the column order in the file regardless of the frame's own order.
predictions.to_csv('submission_rf.csv', sep=',', index=False, columns=['Webpage_id', 'Tag'])

In [38]:
# Refit logistic regression on all training rows (rebinding clf_lr) and export its predictions.
clf_lr = LogisticRegression(random_state=0).fit(X_final, y)

lr_predictions = clf_lr.predict(X_test_final)
predictions = pd.DataFrame({
    'Webpage_id': test_data['Webpage_id'],
    'Tag': lr_predictions,
})
# `columns=` pins the column order in the file regardless of the frame's own order.
predictions.to_csv('submission_lr.csv', sep=',', index=False, columns=['Webpage_id', 'Tag'])

In [65]:
# SGD-trained linear classifier fitted on the full training matrix.
# `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21. `max_iter=20`
# together with `tol=None` keeps the original behaviour of exactly 20 passes over
# the data (no tolerance-based early stopping).
clf_SGD_final = SGDClassifier(max_iter=20, tol=None, random_state=2)
clf_SGD_final.fit(X_final, y)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=20, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=2, shuffle=True, verbose=0,
       warm_start=False)

In [66]:
# Predict test-set tags with the fitted SGD model and export them as a submission file.
tags_precited = clf_SGD_final.predict(X_test_final)
predictions = pd.DataFrame({
    'Webpage_id': test_data['Webpage_id'],
    'Tag': tags_precited,
})
# `columns=` pins the column order in the file regardless of the frame's own order.
predictions.to_csv('submission_5.csv', sep=',', index=False, columns=['Webpage_id', 'Tag'])

In [27]:
# Linear SVM for the hold-out experiment; iteration cap raised to 10000.
clf_svm = LinearSVC(C=1, max_iter=10000, random_state=0)

In [28]:
# Fit the linear SVM on the training split (the fitted estimator is the cell's output).
clf_svm.fit(X_train,y_train)

LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=10000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0)

In [29]:
# Score the linear SVM on the hold-out split.
clf_svm.score(X_test,y_test)

0.79073900841908329

In [30]:
# Re-read the test CSV from disk. This replaces the earlier `test_data` frame, so the
# derived columns added to it above (domain_name, words_in_url) must be recomputed.
test_data = pd.read_csv('test_nvPHrOx.csv')

In [31]:
# Recompute the domain_name column (second dot-separated label of Domain) on the reloaded frame.
test_data['domain_name'] = test_data['Domain'].map(lambda domain: domain.split('.')[1])

In [42]:
# Rebinds `vectorizer_train_data`: a plain word-level TF-IDF (no n-gram or vocabulary
# settings) used for the domain-name-only experiment below.
vectorizer_train_data = TfidfVectorizer(norm='l2', analyzer='word')


In [43]:
# Rebinds X_final / y: features are now TF-IDF of the `domain_name` column only.
X_final = vectorizer_train_data.fit_transform(train_data['domain_name']).toarray()
y = train_data['Tag']

In [44]:
# Rebinds X_test_final: test-set domain names transformed with the vectorizer fitted on training domain names.
X_test_final = vectorizer_train_data.transform(test_data['domain_name']).toarray()

In [52]:
# SGD-trained linear classifier refitted on the current X_final (rebinding clf_SGD_final).
# `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21. `max_iter=20`
# together with `tol=None` keeps the original behaviour of exactly 20 passes over
# the data (no tolerance-based early stopping).
clf_SGD_final = SGDClassifier(max_iter=20, tol=None, random_state=2)
clf_SGD_final.fit(X_final, y)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=20, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=2, shuffle=True, verbose=0,
       warm_start=False)

In [45]:
# Linear SVM with library defaults fitted on the full training matrix.
# NOTE: no visible cell uses clf_svm_final for prediction afterwards.
clf_svm_final = LinearSVC()
clf_svm_final.fit(X_final,y)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [53]:
# Predict test-set tags with the SGD model. The next cell repeats this exact call,
# so this cell is redundant on a top-to-bottom run.
tags_precited = clf_SGD_final.predict(X_test_final)

In [54]:
# Predict test-set tags with the fitted SGD model and export them as a submission file.
tags_precited = clf_SGD_final.predict(X_test_final)
predictions = pd.DataFrame({
    'Webpage_id': test_data['Webpage_id'],
    'Tag': tags_precited,
})
# `columns=` pins the column order in the file regardless of the frame's own order.
predictions.to_csv('submission_1.csv', sep=',', index=False, columns=['Webpage_id', 'Tag'])

In [55]:
# Writes the same frame to the same path with the same arguments as the export at the
# end of the previous cell; redundant on a top-to-bottom run.
predictions.to_csv('submission_1.csv',sep=',',index=None,columns=['Webpage_id','Tag'])