In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.metrics import classification_report,accuracy_score,f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC,LinearSVC
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier,GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier,AdaBoostClassifier,VotingClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def input_file(path,sep=','):
    """Read a delimited text file into a pandas DataFrame.

    path : location of the file (anything ``pd.read_csv`` accepts).
    sep  : field delimiter; defaults to a comma.

    Returns the parsed DataFrame.
    """
    frame=pd.read_csv(path,sep=sep)
    return frame

In [3]:
def describe(file,col=None):
    """Print a quick summary of a DataFrame and, optionally, one column.

    file : DataFrame to inspect.
    col  : optional column label. When given, the column's null count,
           distinct values and value counts are printed as well.

    Returns None; everything is written to stdout.
    """
    # Parenthesised single-argument print calls produce the same output on
    # Python 2 and Python 3, whereas the bare print statement only parses on 2.
    print('Columns :')
    print(list(file.columns.values))
    # Compare against None so that falsy labels such as 0 or '' are not skipped.
    if col is not None:
        column=file[col]
        print('\nNull Values : %s' % column.isnull().sum())
        print('\nUnique Values:')
        print(list(np.unique(column)))
        print('\nValue Counts :')
        print(column.value_counts())

In [4]:
def get_unique(col):
    """Return the distinct values of column ``col`` pooled over train and test.

    Reads the module-level ``train`` and ``test`` frames. The per-frame
    distinct values are gathered into one list and de-duplicated again, so a
    value present in both frames appears once in the result.
    """
    pooled=[]
    for frame in (train,test):
        pooled.extend(np.unique(frame[col]))
    return list(np.unique(pooled))

In [5]:
def preProcess(df,item_class_repeat=10):
    """Collapse the descriptive product columns into one text feature.

    df                : DataFrame holding the raw product columns named below.
    item_class_repeat : how many times each 'Item Class ID' value is repeated
                        back-to-back at the start of the combined text
                        (default 10, the value previously hard-coded).
                        NOTE(review): presumably meant to up-weight that field
                        in the downstream TF-IDF step - confirm.

    Returns a new DataFrame in which the identifier/colour columns and all the
    individual text columns are removed and a 'combined_features' column is
    appended; any other column (e.g. 'tag') is kept. The caller's frame is not
    modified. A missing expected column raises KeyError.
    """
    unused_columns=['item_id','ISBN','Product Long Description','Product Name',
                    'Actual Color','Color','actual_color']
    # Order matters: values are appended to the combined text in this order,
    # after the repeated 'Item Class ID'.
    text_columns=['Short Description','Recommended Location','Aspect Ratio','MPAA Rating',
                  'Actors','Recommended Room','Recommended Use','Product Short Description',
                  'Seller','Artist ID','Literary Genre','Publisher','Genre ID','Synopsis']
    # These two are passed through str() before concatenation.
    stringified_columns=('Artist ID','Genre ID')
    feature_columns=text_columns+['Item Class ID']

    # drop() returns a new frame, so the edits below never touch the input.
    df=df.drop(unused_columns,axis=1)

    # Missing values become a single space. fillna() is used instead of
    # `df.loc[mask, col] = ' '` because writing a string into a numeric column
    # through .loc is deprecated/rejected by recent pandas releases.
    for name in feature_columns:
        df[name]=df[name].fillna(' ')

    # The literal placeholder text carries no information; blank it as well.
    placeholder=df['Short Description']=='short description is not available'
    df.loc[placeholder,'Short Description']=' '

    combined=df['Item Class ID']*item_class_repeat
    for name in text_columns:
        values=df[name].apply(str) if name in stringified_columns else df[name]
        combined=combined+' '+values

    df=df.drop(feature_columns,axis=1)
    df['combined_features']=combined

    return df

In [6]:
# Load the raw tab-separated training and test tables.
train=input_file('products-shelves-tagging-dataset/train.tsv',sep='\t')
test=input_file('products-shelves-tagging-dataset/test.tsv',sep='\t')
# Keep the test ids now: preProcess() drops 'item_id', and the ids are needed
# again when the predictions are written out.
item_ids=test['item_id']

In [7]:
#describe(train,'Short Description')

In [8]:
# Parenthesised single-argument print runs unchanged on Python 2 and Python 3.
# NOTE: train/test are rebound to their pre-processed form, so re-running this
# cell without re-loading the data raises KeyError (preProcess() has already
# dropped the columns it expects).
print('Pre-Processing Training Data')
train=preProcess(train)

print('Pre-Processing Test Data')
test=preProcess(test)

Pre-Processing Training Data
Pre-Processing Test Data


In [None]:

# Hold out 0.5% of the labelled rows for a quick sanity check. The split is
# seeded so the same rows are held out on every run.
X_train,X_test,y_train,y_test=train_test_split(train['combined_features'],train['tag'],
                                               test_size=0.005,random_state=10)
# Fit the vocabulary on the training part only, then reuse it for the hold-out.
vectorizer=TfidfVectorizer(stop_words='english')
X_train=vectorizer.fit_transform(X_train)
X_test=vectorizer.transform(X_test)
model=BaggingClassifier(verbose=1,n_estimators=100,n_jobs=-1,random_state=10)
# Parenthesised single-argument print runs unchanged on Python 2 and Python 3.
print(model)

print('Fitting Model')
model.fit(X_train,y_train)

print('Predicting Output')
y_pred=model.predict(X_test)

print(y_pred)
print("***********")
print(y_test)
# Score the hold-out predictions instead of only eyeballing the raw labels.
print('Accuracy : %s' % accuracy_score(y_test,y_pred))
print('Weighted F1 : %s' % f1_score(y_test,y_pred,average='weighted'))


BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=100, n_jobs=-1, oob_score=False, random_state=10,
         verbose=1, warm_start=False)
Fitting Model


In [198]:
# Disabled experiment: the grid search below is wrapped in a string literal, so
# none of it runs; the cell only evaluates to (and displays) that string.
# NOTE(review): before re-enabling, `parameters.key()` must become
# `parameters.keys()` (dicts have no `key` method), and the pipeline should be
# fitted on raw text, since it contains its own TfidfVectorizer step.
'''
if __name__=='__main__':

    pipeline=Pipeline([('vect',TfidfVectorizer(stop_words='english')),
                       ('clf',LogisticRegression())])
                       
    parameters={
        'vect__max_df':(1,10,100,1000,10000)
        }
        
    grid_search=GridSearchCV(pipeline,parameters,verbose=1,scoring='f1',n_jobs=-1,cv=5)
    
    grid_search.fit(X_train,y_train)
    
    print 'Best Score :',grid_search.best_score_
    
    best_parameters=grid_search.best_estimator_.get_params()
    
    for param_name in parameters.key():
        print param_name,best_parameters[param_name]
'''

"\nif __name__=='__main__':\n    pipeline=Pipeline([('vect',TfidfVectorizer(stop_words='english')),\n                       ('clf',LogisticRegression())])\n    parameters={\n        'vect__max_df':(1,10,100,1000,10000)\n        }\n    grid_search=GridSearchCV(pipeline,parameters,verbose=1,scoring='f1',n_jobs=-1,cv=5)\n    grid_search.fit(X_train,y_train)\n    print 'Best Score :',grid_search.best_score_\n    best_parameters=grid_search.best_estimator_.get_params()\n    for param_name in parameters.key():\n        print param_name,best_parameters[param_name]\n"

In [199]:

model=BaggingClassifier(verbose=1,n_estimators=100,n_jobs=-1,random_state=10)
# Parenthesised single-argument print runs unchanged on Python 2 and Python 3.
print(model)

# Train on every labelled row this time (no hold-out).
vectorizer=TfidfVectorizer(stop_words='english')
X_train=vectorizer.fit_transform(train['combined_features'])
y_train=train['tag']

# The vectorised test matrix gets its own name so `test` keeps pointing at the
# pre-processed DataFrame and this cell can be re-run without reloading data.
X_submit=vectorizer.transform(test['combined_features'])

model.fit(X_train,y_train)
y_pred=model.predict(X_submit)

# Pair ids and predictions by position (not by index alignment); a length
# mismatch raises instead of silently producing NaN rows.
tags=pd.DataFrame({'item_id':item_ids.values,'tag':y_pred},columns=['item_id','tag'])
tags.to_csv('tags.tsv',sep='\t',index=False)


BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=100, n_jobs=-1, oob_score=False, random_state=10,
         verbose=1, warm_start=False)


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  8.1min remaining:  8.1min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  8.4min finished
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  1.4min remaining:  1.4min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  1.7min finished
