In [1]:
#importing the library
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,f1_score

In [2]:
# Read the sales CSV into a DataFrame and display it.
SALES_CSV_PATH='../input/sales-analysis/SalesKaggle3.csv'
df=pd.read_csv(SALES_CSV_PATH)
df

Unnamed: 0,Order,File_Type,SKU_number,SoldFlag,SoldCount,MarketingType,ReleaseNumber,New_Release_Flag,StrengthFactor,PriceReg,ReleaseYear,ItemCount,LowUserPrice,LowNetPrice
0,2,Historical,1737127,0.0,0.0,D,15,1,6.827430e+05,44.99,2015,8,28.97,31.84
1,3,Historical,3255963,0.0,0.0,D,7,1,1.016014e+06,24.81,2005,39,0.00,15.54
2,4,Historical,612701,0.0,0.0,D,0,0,3.404640e+05,46.00,2013,34,30.19,27.97
3,6,Historical,115883,1.0,1.0,D,4,1,3.340110e+05,100.00,2006,20,133.93,83.15
4,7,Historical,863939,1.0,1.0,D,2,1,1.287938e+06,121.95,2010,28,4.00,23.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198912,208023,Active,109683,,,D,7,1,2.101869e+05,72.87,2006,54,8.46,60.59
198913,208024,Active,416462,,,D,8,1,4.555041e+05,247.00,2009,65,8.40,74.85
198914,208025,Active,658242,,,S,2,1,1.692746e+05,50.00,2012,23,23.98,32.62
198915,208026,Active,2538340,,,S,2,1,3.775266e+05,46.95,2001,23,27.42,37.89


In [3]:
#getting information about the dataset (column dtypes and non-null counts)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198917 entries, 0 to 198916
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Order             198917 non-null  int64  
 1   File_Type         198917 non-null  object 
 2   SKU_number        198917 non-null  int64  
 3   SoldFlag          75996 non-null   float64
 4   SoldCount         75996 non-null   float64
 5   MarketingType     198917 non-null  object 
 6   ReleaseNumber     198917 non-null  int64  
 7   New_Release_Flag  198917 non-null  int64  
 8   StrengthFactor    198917 non-null  float64
 9   PriceReg          198917 non-null  float64
 10  ReleaseYear       198917 non-null  int64  
 11  ItemCount         198917 non-null  int64  
 12  LowUserPrice      198917 non-null  float64
 13  LowNetPrice       198917 non-null  float64
dtypes: float64(6), int64(6), object(2)
memory usage: 21.2+ MB


# Preprocessing

In [4]:
#defining the preprocessing function
def preprocess_inputs(df):
    """Turn the raw sales frame into shuffled features and target.

    Only rows whose File_Type is 'Historical' are kept. The 'File_Type',
    'Order' and 'SoldCount' columns are removed, the remaining rows are
    shuffled with a fixed seed, and 'SoldFlag' is split off as the target.
    The caller's DataFrame is not modified.

    Args:
        df: DataFrame containing at least the columns 'File_Type', 'Order',
            'SoldCount' and 'SoldFlag'.

    Returns:
        (x, y): x is the feature DataFrame without 'SoldFlag'; y is the
        'SoldFlag' Series. Both share the same shuffled index.
    """
    # Boolean-mask selection yields a new frame, so the input stays untouched.
    historical = df.loc[df['File_Type'] == 'Historical']

    # 'SoldCount' is dropped together with the bookkeeping columns because the
    # notebook treats it as duplicating the information in 'SoldFlag'.
    trimmed = historical.drop(columns=['File_Type', 'Order', 'SoldCount'])

    # Full-size sample == deterministic shuffle (seeded for reproducibility).
    shuffled = trimmed.sample(frac=1.0, random_state=1)

    target = shuffled['SoldFlag']
    features = shuffled.drop(columns='SoldFlag')
    return features, target

In [5]:
#x: feature columns of the shuffled 'Historical' rows, y: their SoldFlag labels
x,y=preprocess_inputs(df)


In [6]:
x

Unnamed: 0,SKU_number,MarketingType,ReleaseNumber,New_Release_Flag,StrengthFactor,PriceReg,ReleaseYear,ItemCount,LowUserPrice,LowNetPrice
37862,130583,S,12,1,545082.0,96.67,2011,12,73.74,101.33
35304,714748,S,2,1,4273940.0,58.00,2002,32,85.60,23.98
26138,532088,D,9,1,165834.0,76.95,2011,48,75.57,42.67
52327,532867,S,22,1,79220.0,54.25,2012,31,36.47,22.49
6038,50582,D,8,1,80014.0,38.99,2008,62,153.24,69.43
...,...,...,...,...,...,...,...,...,...,...
20609,110243,D,8,1,40841.0,103.24,2010,48,99.50,115.55
21440,244366,D,0,0,1611172.0,86.64,2011,19,55.19,78.38
73349,130281,S,2,1,1628317.0,69.99,2004,43,4.02,30.43
50057,116526,S,2,1,1660915.0,44.00,2004,32,34.51,10.12


In [27]:
#class counts of the training labels
#NOTE: y_train is only created by the cross-validation cell further down,
#so on a fresh kernel this cell raises NameError unless that cell runs first
y_train.value_counts()

0.0    50414
1.0    50414
Name: SoldFlag, dtype: int64

# Building Pipeline

In [10]:
def build_pipeline():
    """Build the preprocessing + random-forest classification pipeline.

    'MarketingType' is one-hot encoded into a dense array (drop='if_binary'
    keeps a single column when the feature has exactly two categories); every
    other column is passed through unchanged. The result feeds a
    RandomForestClassifier seeded with random_state=1.

    Returns:
        An unfitted sklearn Pipeline with the steps 'preprocessor' and
        'regressor'.
    """
    # scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
    # `sparse_output`, and 1.4 removed the old name (passing it raises
    # TypeError). Prefer the new keyword and fall back on older releases.
    try:
        encoder = OneHotEncoder(sparse_output=False, drop='if_binary')
    except TypeError:
        encoder = OneHotEncoder(sparse=False, drop='if_binary')

    nominal_transformer = Pipeline(steps=[
        ('onehot', encoder)
    ])
    preprocessor = ColumnTransformer(transformers=[
        ('nominal', nominal_transformer, ['MarketingType'])
    ], remainder='passthrough')

    # The final step is a classifier; the step name 'regressor' is kept as-is
    # so any code addressing the step by name keeps working.
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestClassifier(random_state=1))
    ])
    return model

In [24]:
# 5-fold cross-validation. KFold is used without shuffling, so each test fold
# is a contiguous slice of x in its current row order.
acc=[]
f1s=[]
kf=KFold(n_splits=5)
for train_idx,test_idx in kf.split(x):
    x_train,x_test=x.iloc[train_idx],x.iloc[test_idx]
    y_train,y_test=y.iloc[train_idx],y.iloc[test_idx]

    # Address class imbalance on the training fold only: RandomOverSampler
    # resamples the minority class until both classes have the same number of
    # rows. The test fold is left untouched.
    oversampler=RandomOverSampler(random_state=1)
    x_train,y_train=oversampler.fit_resample(x_train,y_train)

    # A fresh pipeline per fold, so nothing learned carries over between folds.
    model=build_pipeline().fit(x_train,y_train)
    y_pred=model.predict(x_test)

    # Per-fold scores; F1 is computed for the positive class (SoldFlag == 1.0).
    fold_accuracy=accuracy_score(y_test,y_pred)
    fold_f1=f1_score(y_test,y_pred,pos_label=1.0)
    acc.append(fold_accuracy)
    f1s.append(fold_f1)

# Collapse the per-fold scores into their means (acc is rebound to a scalar).
acc=np.mean(acc)
f1=np.mean(f1s)

In [26]:
#mean accuracy and mean F1 score over the cross-validation folds
print(acc)
print(f1)

0.8270303932737957
0.3353283004513788


In [15]:
#importing confusion matrix
from sklearn.metrics import confusion_matrix

In [16]:
#confusion matrix for a single fold only: when the notebook is run top to
#bottom, y_test and y_pred hold the values from the loop's final iteration
confusion_matrix(y_test,y_pred)

array([[12276,   310],
       [ 2194,   419]])

In [None]:
#count missing values per column among the 'Historical' rows
#the query must run on df: x no longer has a File_Type column because
#preprocess_inputs drops it, so x.query("File_Type==...") would raise
df.query("File_Type=='Historical'").isna().sum()

In [20]:
#index labels of the y_train entries equal to 0.0
#(y_train is whatever the cross-validation cell last assigned)
y_train[y_train==0.0].index

Int64Index([37862, 35304, 26138, 52327,  6038, 12076,  8933, 36633,  2956,
            47029,
            ...
            15435, 60998, 74162,  8989, 41360, 44120, 63825, 57797, 56319,
            62732],
           dtype='int64', length=50414)

In [None]:
#distinct values of the MarketingType column
df['MarketingType'].unique()

In [None]:
#one-hot (dummy) encoding of MarketingType, for inspection only:
#the result is displayed but not assigned to anything
pd.get_dummies(df['MarketingType'])