In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from scipy import stats
import category_encoders as ce

In [2]:
file_train = pd.read_csv('counterfeit_train.csv')
file_test = pd.read_csv('counterfeit_test.csv')

In [3]:
file_train.head()

Unnamed: 0,Medicine_ID,Counterfeit_Weight,DistArea_ID,Active_Since,Medicine_MRP,Medicine_Type,SidEffect_Level,Availability_rating,Area_Type,Area_City_Type,Area_dist_level,Counterfeit_Sales
0,RRA15,13.1,Area046,1995,160.2366,Antimalarial,critical,0.070422,DownTown,Tier 1,Small,1775.5026
1,YVV26,,Area027,1983,110.4384,Mstablizers,mild,0.013,CityLimits,Tier 3,Medium,3069.152
2,LJC15,9.025,Area046,1995,259.4092,Cardiac,mild,0.060783,DownTown,Tier 1,Small,2603.092
3,GWC40,11.8,Area046,1995,99.983,OralContraceptives,mild,0.065555,DownTown,Tier 1,Small,1101.713
4,QMN13,,Area019,1983,56.4402,Hreplacements,critical,0.248859,MidTownResidential,Tier 1,Small,158.9402


In [4]:
file_train.dtypes

Medicine_ID             object
Counterfeit_Weight     float64
DistArea_ID             object
Active_Since             int64
Medicine_MRP           float64
Medicine_Type           object
SidEffect_Level         object
Availability_rating    float64
Area_Type               object
Area_City_Type          object
Area_dist_level         object
Counterfeit_Sales      float64
dtype: object

In [5]:
file_train.isnull().sum()

Medicine_ID               0
Counterfeit_Weight     1166
DistArea_ID               0
Active_Since              0
Medicine_MRP              0
Medicine_Type             0
SidEffect_Level           0
Availability_rating       0
Area_Type                 0
Area_City_Type            0
Area_dist_level           0
Counterfeit_Sales         0
dtype: int64

In [6]:
file_test.head()

Unnamed: 0,Medicine_ID,Counterfeit_Weight,DistArea_ID,Active_Since,Medicine_MRP,Medicine_Type,SidEffect_Level,Availability_rating,Area_Type,Area_City_Type,Area_dist_level
0,HLZ81,,Area027,1983,85.5328,Antibiotics,mild,0.112747,CityLimits,Tier 3,Medium
1,ECE94,13.45,Area045,2000,257.146,OralContraceptives,mild,0.144446,DownTown,Tier 2,Unknown
2,SAD14,7.1,Area045,2000,98.1172,Antipyretics,mild,0.144221,DownTown,Tier 2,Unknown
3,EQV63,18.3,Area010,1996,135.373,Tranquilizers,mild,0.100388,MidTownResidential,Tier 3,Unknown
4,AIR10,,Area019,1983,112.8016,OralContraceptives,mild,0.022585,MidTownResidential,Tier 1,Small


In [7]:
file_test.isnull().sum()

Medicine_ID              0
Counterfeit_Weight     297
DistArea_ID              0
Active_Since             0
Medicine_MRP             0
Medicine_Type            0
SidEffect_Level          0
Availability_rating      0
Area_Type                0
Area_City_Type           0
Area_dist_level          0
dtype: int64

In [8]:
file_train.shape

(6818, 12)

In [9]:
file_test.shape

(1705, 11)

In [10]:
file_test['Counterfeit_Sales'] = np.nan
file_test = file_test[file_train.columns]
file_train['Data'] = 'Train'
file_test['Data'] = 'Test'

In [12]:
zscore = np.abs(stats.zscore(file_train['Counterfeit_Sales']))

threshold=3
value = np.where(zscore>threshold)[0]

file_train.drop(value, inplace=True)
#print(np.where(zscore_comp>threshold), len(drp_idx))

In [13]:
file_train.shape

(6752, 13)

In [14]:
file_test.shape

(1705, 13)

In [15]:
file_train.isnull().sum()

Medicine_ID               0
Counterfeit_Weight     1121
DistArea_ID               0
Active_Since              0
Medicine_MRP              0
Medicine_Type             0
SidEffect_Level           0
Availability_rating       0
Area_Type                 0
Area_City_Type            0
Area_dist_level           0
Counterfeit_Sales         0
Data                      0
dtype: int64

In [16]:
file_test.isnull().sum()

Medicine_ID               0
Counterfeit_Weight      297
DistArea_ID               0
Active_Since              0
Medicine_MRP              0
Medicine_Type             0
SidEffect_Level           0
Availability_rating       0
Area_Type                 0
Area_City_Type            0
Area_dist_level           0
Counterfeit_Sales      1705
Data                      0
dtype: int64

In [17]:
file_conc = pd.concat([file_train,file_test], axis=0)

In [18]:
file_conc

Unnamed: 0,Medicine_ID,Counterfeit_Weight,DistArea_ID,Active_Since,Medicine_MRP,Medicine_Type,SidEffect_Level,Availability_rating,Area_Type,Area_City_Type,Area_dist_level,Counterfeit_Sales,Data
0,RRA15,13.100,Area046,1995,160.2366,Antimalarial,critical,0.070422,DownTown,Tier 1,Small,1775.5026,Train
1,YVV26,,Area027,1983,110.4384,Mstablizers,mild,0.013000,CityLimits,Tier 3,Medium,3069.1520,Train
2,LJC15,9.025,Area046,1995,259.4092,Cardiac,mild,0.060783,DownTown,Tier 1,Small,2603.0920,Train
3,GWC40,11.800,Area046,1995,99.9830,OralContraceptives,mild,0.065555,DownTown,Tier 1,Small,1101.7130,Train
4,QMN13,,Area019,1983,56.4402,Hreplacements,critical,0.248859,MidTownResidential,Tier 1,Small,158.9402,Train
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1700,KXW10,,Area027,1983,136.5704,Hreplacements,mild,0.050505,CityLimits,Tier 3,Medium,,Test
1701,CKE54,21.300,Area035,2002,57.0744,Antibiotics,critical,0.041118,DownTown,Tier 2,Small,,Test
1702,HAY13,20.400,Area017,2005,182.7422,Antiseptics,mild,0.191273,DownTown,Tier 2,Unknown,,Test
1703,ZEE32,20.000,Area018,2007,266.9672,Hreplacements,mild,0.013000,Industrial,Tier 3,Medium,,Test


In [19]:
file_conc.shape

(8457, 13)

In [20]:
file_conc.isnull().sum()

Medicine_ID               0
Counterfeit_Weight     1418
DistArea_ID               0
Active_Since              0
Medicine_MRP              0
Medicine_Type             0
SidEffect_Level           0
Availability_rating       0
Area_Type                 0
Area_City_Type            0
Area_dist_level           0
Counterfeit_Sales      1705
Data                      0
dtype: int64

In [21]:
miss_val_repla = file_conc.groupby(['Medicine_ID'])['Counterfeit_Weight'].mean().to_dict()
for key, values in miss_val_repla.items():
    file_conc.loc[file_conc['Medicine_ID']== key, 'Counterfeit_Weight'] = np.round(values)

# file_conc.drop(file_conc[file_conc['Counterfeit_Weight'].isna()].index ,  inplace=True)

In [22]:
file_conc.isnull().sum()

Medicine_ID               0
Counterfeit_Weight        4
DistArea_ID               0
Active_Since              0
Medicine_MRP              0
Medicine_Type             0
SidEffect_Level           0
Availability_rating       0
Area_Type                 0
Area_City_Type            0
Area_dist_level           0
Counterfeit_Sales      1705
Data                      0
dtype: int64

In [23]:
for col in file_conc.columns:
    if (col not in ['Counterfeit_Sales', 'Data']) & (file_conc[col].isnull().sum()>0):
        file_conc.loc[file_conc[col].isnull(),col] = file_conc.loc[file_conc['Data'] == 'Train',col].mean()

In [24]:
file_conc.isnull().sum()

Medicine_ID               0
Counterfeit_Weight        0
DistArea_ID               0
Active_Since              0
Medicine_MRP              0
Medicine_Type             0
SidEffect_Level           0
Availability_rating       0
Area_Type                 0
Area_City_Type            0
Area_dist_level           0
Counterfeit_Sales      1705
Data                      0
dtype: int64

In [25]:
# file_conc['Counterfeit_Weight'] = file_conc.groupby('Medicine_ID').transform(lambda x: x.fillna(x.mean()))

In [None]:
import seaborn as sns
%matplotlib inline

In [None]:
# sns.histplot(file_conc['Counterfeit_Weight'])

In [None]:
# sns.displot(file_conc['Counterfeit_Weight'])

In [None]:
# file_conc['Counterfeit_Weight'].isnull().sum()

In [None]:
# file_conc['Counterfeit_Weight'].mean()

In [None]:
# file_conc['Counterfeit_Weight'].median()

In [None]:
# file_conc['Counterfeit_Weight'].mode()

In [26]:
# file_conc.drop(['Medicine_ID', 'Counterfeit_Weight'],axis=1,inplace=True)
file_conc.drop(['Medicine_ID'],axis=1,inplace=True)

In [27]:
file_conc

Unnamed: 0,Counterfeit_Weight,DistArea_ID,Active_Since,Medicine_MRP,Medicine_Type,SidEffect_Level,Availability_rating,Area_Type,Area_City_Type,Area_dist_level,Counterfeit_Sales,Data
0,13.0,Area046,1995,160.2366,Antimalarial,critical,0.070422,DownTown,Tier 1,Small,1775.5026,Train
1,7.0,Area027,1983,110.4384,Mstablizers,mild,0.013000,CityLimits,Tier 3,Medium,3069.1520,Train
2,9.0,Area046,1995,259.4092,Cardiac,mild,0.060783,DownTown,Tier 1,Small,2603.0920,Train
3,12.0,Area046,1995,99.9830,OralContraceptives,mild,0.065555,DownTown,Tier 1,Small,1101.7130,Train
4,9.0,Area019,1983,56.4402,Hreplacements,critical,0.248859,MidTownResidential,Tier 1,Small,158.9402,Train
...,...,...,...,...,...,...,...,...,...,...,...,...
1700,17.0,Area027,1983,136.5704,Hreplacements,mild,0.050505,CityLimits,Tier 3,Medium,,Test
1701,21.0,Area035,2002,57.0744,Antibiotics,critical,0.041118,DownTown,Tier 2,Small,,Test
1702,20.0,Area017,2005,182.7422,Antiseptics,mild,0.191273,DownTown,Tier 2,Unknown,,Test
1703,20.0,Area018,2007,266.9672,Hreplacements,mild,0.013000,Industrial,Tier 3,Medium,,Test


In [28]:
file_conc.shape

(8457, 12)

In [29]:
cat_col = file_conc.select_dtypes(['object']).columns

In [30]:
cat_col

Index(['DistArea_ID', 'Medicine_Type', 'SidEffect_Level', 'Area_Type',
       'Area_City_Type', 'Area_dist_level', 'Data'],
      dtype='object')

In [31]:
type(cat_col)

pandas.core.indexes.base.Index

In [32]:
cat_col =cat_col[:-1]

In [33]:
cat_col

Index(['DistArea_ID', 'Medicine_Type', 'SidEffect_Level', 'Area_Type',
       'Area_City_Type', 'Area_dist_level'],
      dtype='object')

In [34]:
cat_col.value_counts()

SidEffect_Level    1
Area_dist_level    1
DistArea_ID        1
Area_Type          1
Area_City_Type     1
Medicine_Type      1
dtype: int64

In [35]:
# for col in cat_col:
#     freqns= file_conc[col].value_counts()
#     k = freqns.index[freqns>80][:-1]
#     for cat in k:
#         name=col+'_'+cat
#         file_conc[name]=(file_conc[col]==cat).astype(int)
#     del file_conc[col]
#     print(col)

In [36]:
for col in ['DistArea_ID', 'Medicine_Type', 'SidEffect_Level', 'Area_Type','Area_City_Type', 'Area_dist_level']:
            temp = pd.get_dummies(file_conc[col], prefix=col, drop_first=True)
            file_conc = pd.concat([temp, file_conc], 1)
            file_conc.drop([col],1,inplace=True)

In [37]:
# #DistArea_ID , Medicine_Type

# import category_encoders as ce

# file_conc['DistArea_ID'] = file_conc['DistArea_ID'].replace('Area', '')
# be = ce.BinaryEncoder(cols=['DistArea_ID'])
# City_code = be.fit_transform(file_conc['DistArea_ID'] , file_conc['Counterfeit_Sales'])
# file_conc  = pd.concat([file_conc, City_code] , axis=1)

# #del all_data['DistArea_ID_0']
# Medicine_type = file_conc['Medicine_Type'].value_counts().to_dict()
# file_conc['Medicine_Type'] = file_conc['Medicine_Type'].replace(Medicine_type)

# file_conc.drop(['DistArea_ID'], axis=1,inplace=True)
# #all_data.drop(['Active_Since'],axis=1,inplace=True)
# file_conc.drop('Medicine_Type', axis=1, inplace=True)

In [38]:
file_conc.columns

Index(['Area_dist_level_Medium', 'Area_dist_level_Small',
       'Area_dist_level_Unknown', 'Area_City_Type_Tier 2',
       'Area_City_Type_Tier 3', 'Area_Type_DownTown', 'Area_Type_Industrial',
       'Area_Type_MidTownResidential', 'SidEffect_Level_mild',
       'Medicine_Type_Antacids', 'Medicine_Type_Antibiotics',
       'Medicine_Type_Antifungal', 'Medicine_Type_Antimalarial',
       'Medicine_Type_Antipyretics', 'Medicine_Type_Antiseptics',
       'Medicine_Type_Antiviral', 'Medicine_Type_Cardiac',
       'Medicine_Type_Hreplacements', 'Medicine_Type_Mstablizers',
       'Medicine_Type_MuscleRelaxants', 'Medicine_Type_OralContraceptives',
       'Medicine_Type_Statins', 'Medicine_Type_Stimulants',
       'Medicine_Type_Tranquilizers', 'DistArea_ID_Area013',
       'DistArea_ID_Area017', 'DistArea_ID_Area018', 'DistArea_ID_Area019',
       'DistArea_ID_Area027', 'DistArea_ID_Area035', 'DistArea_ID_Area045',
       'DistArea_ID_Area046', 'DistArea_ID_Area049', 'Counterfeit_Weight',

In [39]:
file_conc.dtypes

Area_dist_level_Medium                uint8
Area_dist_level_Small                 uint8
Area_dist_level_Unknown               uint8
Area_City_Type_Tier 2                 uint8
Area_City_Type_Tier 3                 uint8
Area_Type_DownTown                    uint8
Area_Type_Industrial                  uint8
Area_Type_MidTownResidential          uint8
SidEffect_Level_mild                  uint8
Medicine_Type_Antacids                uint8
Medicine_Type_Antibiotics             uint8
Medicine_Type_Antifungal              uint8
Medicine_Type_Antimalarial            uint8
Medicine_Type_Antipyretics            uint8
Medicine_Type_Antiseptics             uint8
Medicine_Type_Antiviral               uint8
Medicine_Type_Cardiac                 uint8
Medicine_Type_Hreplacements           uint8
Medicine_Type_Mstablizers             uint8
Medicine_Type_MuscleRelaxants         uint8
Medicine_Type_OralContraceptives      uint8
Medicine_Type_Statins                 uint8
Medicine_Type_Stimulants        

In [40]:
file_conc.shape

(8457, 39)

In [41]:
file_conc.loc[file_conc['Data'] == 'Train'].isnull().sum()

Area_dist_level_Medium              0
Area_dist_level_Small               0
Area_dist_level_Unknown             0
Area_City_Type_Tier 2               0
Area_City_Type_Tier 3               0
Area_Type_DownTown                  0
Area_Type_Industrial                0
Area_Type_MidTownResidential        0
SidEffect_Level_mild                0
Medicine_Type_Antacids              0
Medicine_Type_Antibiotics           0
Medicine_Type_Antifungal            0
Medicine_Type_Antimalarial          0
Medicine_Type_Antipyretics          0
Medicine_Type_Antiseptics           0
Medicine_Type_Antiviral             0
Medicine_Type_Cardiac               0
Medicine_Type_Hreplacements         0
Medicine_Type_Mstablizers           0
Medicine_Type_MuscleRelaxants       0
Medicine_Type_OralContraceptives    0
Medicine_Type_Statins               0
Medicine_Type_Stimulants            0
Medicine_Type_Tranquilizers         0
DistArea_ID_Area013                 0
DistArea_ID_Area017                 0
DistArea_ID_

In [42]:
file_conc.loc[file_conc['Data'] == 'Test'].isnull().sum()

Area_dist_level_Medium                 0
Area_dist_level_Small                  0
Area_dist_level_Unknown                0
Area_City_Type_Tier 2                  0
Area_City_Type_Tier 3                  0
Area_Type_DownTown                     0
Area_Type_Industrial                   0
Area_Type_MidTownResidential           0
SidEffect_Level_mild                   0
Medicine_Type_Antacids                 0
Medicine_Type_Antibiotics              0
Medicine_Type_Antifungal               0
Medicine_Type_Antimalarial             0
Medicine_Type_Antipyretics             0
Medicine_Type_Antiseptics              0
Medicine_Type_Antiviral                0
Medicine_Type_Cardiac                  0
Medicine_Type_Hreplacements            0
Medicine_Type_Mstablizers              0
Medicine_Type_MuscleRelaxants          0
Medicine_Type_OralContraceptives       0
Medicine_Type_Statins                  0
Medicine_Type_Stimulants               0
Medicine_Type_Tranquilizers            0
DistArea_ID_Area

In [43]:
train_data = file_conc[file_conc['Data'] =='Train']
del train_data['Data']

In [44]:
train_data.isnull().sum()

Area_dist_level_Medium              0
Area_dist_level_Small               0
Area_dist_level_Unknown             0
Area_City_Type_Tier 2               0
Area_City_Type_Tier 3               0
Area_Type_DownTown                  0
Area_Type_Industrial                0
Area_Type_MidTownResidential        0
SidEffect_Level_mild                0
Medicine_Type_Antacids              0
Medicine_Type_Antibiotics           0
Medicine_Type_Antifungal            0
Medicine_Type_Antimalarial          0
Medicine_Type_Antipyretics          0
Medicine_Type_Antiseptics           0
Medicine_Type_Antiviral             0
Medicine_Type_Cardiac               0
Medicine_Type_Hreplacements         0
Medicine_Type_Mstablizers           0
Medicine_Type_MuscleRelaxants       0
Medicine_Type_OralContraceptives    0
Medicine_Type_Statins               0
Medicine_Type_Stimulants            0
Medicine_Type_Tranquilizers         0
DistArea_ID_Area013                 0
DistArea_ID_Area017                 0
DistArea_ID_

In [45]:
test_data = file_conc[file_conc['Data']=='Test']
test_data.drop (['Counterfeit_Sales', 'Data'], axis=1, inplace = True)
# del test_data['Data']
# del test_data['Counterfeit_Sales'] 

In [46]:
test_data.isnull().sum()

Area_dist_level_Medium              0
Area_dist_level_Small               0
Area_dist_level_Unknown             0
Area_City_Type_Tier 2               0
Area_City_Type_Tier 3               0
Area_Type_DownTown                  0
Area_Type_Industrial                0
Area_Type_MidTownResidential        0
SidEffect_Level_mild                0
Medicine_Type_Antacids              0
Medicine_Type_Antibiotics           0
Medicine_Type_Antifungal            0
Medicine_Type_Antimalarial          0
Medicine_Type_Antipyretics          0
Medicine_Type_Antiseptics           0
Medicine_Type_Antiviral             0
Medicine_Type_Cardiac               0
Medicine_Type_Hreplacements         0
Medicine_Type_Mstablizers           0
Medicine_Type_MuscleRelaxants       0
Medicine_Type_OralContraceptives    0
Medicine_Type_Statins               0
Medicine_Type_Stimulants            0
Medicine_Type_Tranquilizers         0
DistArea_ID_Area013                 0
DistArea_ID_Area017                 0
DistArea_ID_

In [47]:
# del file_conc

In [48]:
from sklearn.model_selection import train_test_split

In [49]:
train1, train2 = train_test_split(train_data, test_size = 0.2, random_state=2)

In [50]:
x_train1 = train1.drop('Counterfeit_Sales', axis=1)
y_train1 = train1['Counterfeit_Sales']
x_train2 = train2.drop('Counterfeit_Sales', axis=1)
y_train2 = train2['Counterfeit_Sales']

In [51]:
from sklearn.linear_model import LinearRegression

In [52]:
lr = LinearRegression()
lr.fit(x_train1,y_train1)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [53]:
x_train1.shape

(5401, 37)

In [54]:
y_train1.shape

(5401,)

In [55]:
lr.intercept_

51817.94510669846

In [56]:
list(zip(x_train1.columns, lr.coef_))

[('Area_dist_level_Medium', 534.1395400619714),
 ('Area_dist_level_Small', -111.75558981744166),
 ('Area_dist_level_Unknown', -286.51014740814986),
 ('Area_City_Type_Tier 2', 460.75565901204766),
 ('Area_City_Type_Tier 3', 19.11282489492355),
 ('Area_Type_DownTown', 327.15183279204297),
 ('Area_Type_Industrial', -24.700393587559613),
 ('Area_Type_MidTownResidential', -1112.5243079159075),
 ('SidEffect_Level_mild', 0.1580306685123152),
 ('Medicine_Type_Antacids', 21.455141587822304),
 ('Medicine_Type_Antibiotics', 59.316485853803606),
 ('Medicine_Type_Antifungal', 71.567719568153),
 ('Medicine_Type_Antimalarial', 7.6393297439803485),
 ('Medicine_Type_Antipyretics', -113.20507481064885),
 ('Medicine_Type_Antiseptics', -38.70649695191906),
 ('Medicine_Type_Antiviral', 223.72783847489274),
 ('Medicine_Type_Cardiac', 62.350292963529164),
 ('Medicine_Type_Hreplacements', -59.95561466821411),
 ('Medicine_Type_Mstablizers', 16.835950566482758),
 ('Medicine_Type_MuscleRelaxants', 40.74325441515

In [58]:
x_train2.shape

(1351, 37)

In [59]:
y_train2.shape

(1351,)

In [60]:
predicted_sales = lr.predict(x_train2)

In [61]:
predicted_sales

array([2743.91748967, 1513.59865503, 5293.36024902, ..., 1916.73270006,
       3603.33786586, 1866.47965879])

In [62]:
from sklearn.metrics import mean_absolute_error

In [63]:
mean_absolute_error(y_train2,predicted_sales)

796.2759238132572

In [64]:
x_train = train_data.drop('Counterfeit_Sales',axis=1)
y_train = train_data['Counterfeit_Sales']

In [65]:
lr.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [66]:
test_pred_sales = lr.predict(test_data)

In [67]:
test_pred_sales.shape

(1705,)

In [68]:
list(zip(x_train.columns, lr.coef_))

[('Area_dist_level_Medium', 532.6075456244698),
 ('Area_dist_level_Small', -112.9051882486007),
 ('Area_dist_level_Unknown', -294.07679440526306),
 ('Area_City_Type_Tier 2', 456.3643737952828),
 ('Area_City_Type_Tier 3', 17.806597567853295),
 ('Area_Type_DownTown', 328.8444215313003),
 ('Area_Type_Industrial', -21.799945758566743),
 ('Area_Type_MidTownResidential', -1109.8129175290874),
 ('SidEffect_Level_mild', -16.278102629743135),
 ('Medicine_Type_Antacids', 45.50535734821201),
 ('Medicine_Type_Antibiotics', 45.519178276509),
 ('Medicine_Type_Antifungal', 32.80353799457946),
 ('Medicine_Type_Antimalarial', -8.689673061102786),
 ('Medicine_Type_Antipyretics', -109.07673229553717),
 ('Medicine_Type_Antiseptics', -12.50958383188259),
 ('Medicine_Type_Antiviral', 343.610531621805),
 ('Medicine_Type_Cardiac', 64.96901602294737),
 ('Medicine_Type_Hreplacements', 1.1911640788706515),
 ('Medicine_Type_Mstablizers', 18.618736695064303),
 ('Medicine_Type_MuscleRelaxants', 77.93637175488689),


In [75]:
pd.DataFrame(test_pred_sales).to_csv('project3_LinReg.csv',index=False)

## Ridge 

In [69]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

In [70]:
lambdas = np.linspace(1,100,100)
params = {'alpha':lambdas}
model = Ridge(fit_intercept = True)

In [71]:
grid_search = GridSearchCV(model, param_grid = params, cv=10, scoring = 'neg_mean_absolute_error')

In [72]:
grid_search.fit(x_train1,y_train1)

GridSearchCV(cv=10, error_score=nan,
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False, random_state=None,
                             solver='auto', tol=0.001),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': array([  1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,
        12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,  22.,
        23.,  24.,  25.,  26.,  27.,  28.,  29.,  30.,  31.,  32.,  33.,
        34.,  35....,  41.,  42.,  43.,  44.,
        45.,  46.,  47.,  48.,  49.,  50.,  51.,  52.,  53.,  54.,  55.,
        56.,  57.,  58.,  59.,  60.,  61.,  62.,  63.,  64.,  65.,  66.,
        67.,  68.,  69.,  70.,  71.,  72.,  73.,  74.,  75.,  76.,  77.,
        78.,  79.,  80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,  88.,
        89.,  90.,  91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,  99.,
       100.])},
             pre_dispatch='2*n

In [73]:
grid_search.best_estimator_

Ridge(alpha=45.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [74]:
pred_sal_rid = grid_search.predict(x_train2)

In [75]:
pred_sal_rid

array([2725.92733917, 1494.17396522, 5230.95133444, ..., 1941.51926234,
       3599.90060433, 1884.21084486])

In [76]:
mean_absolute_error(y_train2,pred_sal_rid)

794.9186968407596

## Using Ridge on entire training data.

In [77]:
lambdas = np.linspace(1, 100, 100)
params = {'alpha':lambdas}
model = Ridge(fit_intercept = True)

In [78]:
grid_search = GridSearchCV(model, param_grid = params, cv=10, scoring = 'neg_mean_absolute_error')

In [79]:
grid_search.fit(x_train,y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False, random_state=None,
                             solver='auto', tol=0.001),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': array([  1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,
        12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,  22.,
        23.,  24.,  25.,  26.,  27.,  28.,  29.,  30.,  31.,  32.,  33.,
        34.,  35....,  41.,  42.,  43.,  44.,
        45.,  46.,  47.,  48.,  49.,  50.,  51.,  52.,  53.,  54.,  55.,
        56.,  57.,  58.,  59.,  60.,  61.,  62.,  63.,  64.,  65.,  66.,
        67.,  68.,  69.,  70.,  71.,  72.,  73.,  74.,  75.,  76.,  77.,
        78.,  79.,  80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,  88.,
        89.,  90.,  91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,  99.,
       100.])},
             pre_dispatch='2*n

In [80]:
grid_search.best_estimator_

Ridge(alpha=63.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [81]:
grid_search.cv_results_

{'mean_fit_time': array([0.0118278 , 0.01097901, 0.01189396, 0.01196334, 0.01311669,
        0.01242297, 0.011221  , 0.0115051 , 0.01156967, 0.01201985,
        0.01249208, 0.01282501, 0.01296685, 0.01244955, 0.0131187 ,
        0.01218913, 0.0127188 , 0.01156747, 0.01232944, 0.01260817,
        0.01359558, 0.01291034, 0.0126363 , 0.01246138, 0.01173356,
        0.01177535, 0.01198452, 0.00986242, 0.00931511, 0.01094809,
        0.0128346 , 0.01277647, 0.00822122, 0.00684135, 0.0122503 ,
        0.00978336, 0.01162627, 0.01172943, 0.01344266, 0.01178126,
        0.01173551, 0.01265869, 0.01252432, 0.01166697, 0.01256773,
        0.01144793, 0.01197782, 0.01247716, 0.01266739, 0.01283009,
        0.01209748, 0.01216352, 0.01224663, 0.01251633, 0.01238122,
        0.01304526, 0.01197846, 0.0123467 , 0.0129133 , 0.01253169,
        0.01239684, 0.01239784, 0.0127058 , 0.01229417, 0.01200442,
        0.0124958 , 0.01229882, 0.01252055, 0.01261401, 0.01268947,
        0.01420255, 0.01263869,

In [82]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3E} (std: {1:.3})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [83]:
report(grid_search.cv_results_,100)

Model with rank: 1
Mean validation score: -7.905E+02 (std: 24.1)
Parameters: {'alpha': 63.0}

Model with rank: 2
Mean validation score: -7.905E+02 (std: 24.1)
Parameters: {'alpha': 62.0}

Model with rank: 3
Mean validation score: -7.905E+02 (std: 24.1)
Parameters: {'alpha': 64.0}

Model with rank: 4
Mean validation score: -7.905E+02 (std: 24.1)
Parameters: {'alpha': 61.0}

Model with rank: 5
Mean validation score: -7.905E+02 (std: 24.1)
Parameters: {'alpha': 65.0}

Model with rank: 6
Mean validation score: -7.905E+02 (std: 24.1)
Parameters: {'alpha': 60.0}

Model with rank: 7
Mean validation score: -7.905E+02 (std: 24.1)
Parameters: {'alpha': 66.0}

Model with rank: 8
Mean validation score: -7.905E+02 (std: 24.1)
Parameters: {'alpha': 59.0}

Model with rank: 9
Mean validation score: -7.905E+02 (std: 24.1)
Parameters: {'alpha': 58.0}

Model with rank: 10
Mean validation score: -7.905E+02 (std: 24.1)
Parameters: {'alpha': 67.0}

Model with rank: 11
Mean validation score: -7.905E+02 (std:

## COFFECIENTS IN RIDGE

In [84]:
ridge_model = grid_search.best_estimator_

In [85]:
ridge_model.fit(x_train,y_train)

Ridge(alpha=63.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [86]:
list(zip(x_train.columns,ridge_model.coef_))

[('Area_dist_level_Medium', 512.2362956079319),
 ('Area_dist_level_Small', -116.18519859770876),
 ('Area_dist_level_Unknown', -284.04328587510156),
 ('Area_City_Type_Tier 2', 430.5959579531008),
 ('Area_City_Type_Tier 3', 20.50701286967308),
 ('Area_Type_DownTown', 324.00956068007537),
 ('Area_Type_Industrial', -25.952718128443333),
 ('Area_Type_MidTownResidential', -1055.1519915615934),
 ('SidEffect_Level_mild', -16.381776623449397),
 ('Medicine_Type_Antacids', 22.07810442300854),
 ('Medicine_Type_Antibiotics', 26.880971881015515),
 ('Medicine_Type_Antifungal', 13.982250104673),
 ('Medicine_Type_Antimalarial', -21.45287076315363),
 ('Medicine_Type_Antipyretics', -114.3842753948635),
 ('Medicine_Type_Antiseptics', -25.943028947127427),
 ('Medicine_Type_Antiviral', 144.15851163372622),
 ('Medicine_Type_Cardiac', 45.82835820731156),
 ('Medicine_Type_Hreplacements', -13.873266516117198),
 ('Medicine_Type_Mstablizers', 2.6013954147934086),
 ('Medicine_Type_MuscleRelaxants', 40.325921720868

In [87]:
grid_search.best_score_

-790.4556099124482

In [88]:
grid_search.best_params_

{'alpha': 63.0}

In [89]:
final_sal_rid = grid_search.predict(test_data)

In [90]:
sales = pd.DataFrame(final_sal_rid)

In [91]:
df = pd.read_csv('sample_submission_P3.csv')

In [92]:
df

Unnamed: 0,Medicine_ID,Counterfeit_Sales
0,HLZ81,2522.529514
1,ECE94,3802.072094
2,SAD14,1397.417815
3,EQV63,291.229787
4,AIR10,-23.995078
...,...,...
1700,KXW10,3234.535810
1701,CKE54,1133.286170
1702,HAY13,2828.182002
1703,ZEE32,3765.961353


In [93]:
df["Counterfeit_Sales"] = sales

In [94]:
df

Unnamed: 0,Medicine_ID,Counterfeit_Sales
0,HLZ81,2522.529514
1,ECE94,3802.072094
2,SAD14,1397.417815
3,EQV63,291.229787
4,AIR10,-23.995078
...,...,...
1700,KXW10,3234.535810
1701,CKE54,1133.286170
1702,HAY13,2828.182002
1703,ZEE32,3765.961353


In [107]:
df.to_csv("sample_submission_P3.csv", index=False)

In [None]:
# pd.DataFrame(final_sal_rid).to_csv('project3_RidReg.csv',index=False)

## Lasso

In [95]:
from sklearn.linear_model import Lasso

In [96]:
lambdas1 = np.linspace(0.1,100,50)
params1 = {'alpha':lambdas}
model1 = Lasso(fit_intercept = True)

In [97]:
grid_search1 = GridSearchCV(model1, param_grid = params1, cv=10, n_jobs=-1, verbose=10, scoring = 'neg_mean_absolute_error')

In [98]:
grid_search1.fit(x_train1,y_train1)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 113 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 189 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 233 tasks      | elapsed:  

GridSearchCV(cv=10, error_score=nan,
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'alpha': array([  1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,
        12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,  2...
        34.,  35.,  36.,  37.,  38.,  39.,  40.,  41.,  42.,  43.,  44.,
        45.,  46.,  47.,  48.,  49.,  50.,  51.,  52.,  53.,  54.,  55.,
        56.,  57.,  58.,  59.,  60.,  61.,  62.,  63.,  64.,  65.,  66.,
        67.,  68.,  69.,  70.,  71.,  72.,  73.,  74.,  75.,  76.,  77.,
        78.,  79.,  80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,  88.,
        89.,  90.,  91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,  99.,
       

In [99]:
grid_search1.best_estimator_

Lasso(alpha=4.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [100]:
pred_sal_lasso = grid_search1.predict(x_train2)

In [101]:
pred_sal_lasso

array([2755.32694601, 1455.41111522, 5272.58564976, ..., 1988.81162476,
       3627.53568223, 1854.61263407])

In [102]:
mean_absolute_error(y_train2,pred_sal_lasso)

796.4778559570784

## Using Lasso on entire training data.

In [103]:
lambdas1 = np.linspace(0.1,5,50)
params1 = {'alpha':lambdas}
model1 = Lasso(fit_intercept = True)

In [104]:
grid_search1 = GridSearchCV(model1, param_grid = params1, cv=10, n_jobs=-1, verbose=10, scoring = 'neg_mean_absolute_error')

In [105]:
grid_search1.fit(x_train,y_train)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0690s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 100 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 194 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 228 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 266 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 304 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 346 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 388 ta

GridSearchCV(cv=10, error_score=nan,
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'alpha': array([  1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,
        12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,  2...
        34.,  35.,  36.,  37.,  38.,  39.,  40.,  41.,  42.,  43.,  44.,
        45.,  46.,  47.,  48.,  49.,  50.,  51.,  52.,  53.,  54.,  55.,
        56.,  57.,  58.,  59.,  60.,  61.,  62.,  63.,  64.,  65.,  66.,
        67.,  68.,  69.,  70.,  71.,  72.,  73.,  74.,  75.,  76.,  77.,
        78.,  79.,  80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,  88.,
        89.,  90.,  91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,  99.,
       

In [106]:
grid_search1.best_estimator_

Lasso(alpha=4.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [107]:
grid_search1.cv_results_

{'mean_fit_time': array([0.05872338, 0.04707968, 0.0397516 , 0.037009  , 0.03786418,
        0.03415349, 0.03199456, 0.02823634, 0.02534144, 0.02602472,
        0.02639263, 0.02470975, 0.02177641, 0.02467759, 0.02559276,
        0.02748766, 0.02965436, 0.02844572, 0.02791889, 0.0250227 ,
        0.02354755, 0.02327409, 0.02089458, 0.02224684, 0.02148178,
        0.02101209, 0.02020586, 0.02036302, 0.02057335, 0.01966794,
        0.02116914, 0.02039666, 0.02234921, 0.02109008, 0.01970553,
        0.02130117, 0.02131183, 0.02013111, 0.01978509, 0.01945434,
        0.02031784, 0.01990025, 0.01901371, 0.02047677, 0.01877382,
        0.01961937, 0.02119267, 0.01926775, 0.02007802, 0.02060802,
        0.02002287, 0.01929574, 0.01842787, 0.01765027, 0.01871502,
        0.01965337, 0.02006934, 0.0194241 , 0.02142007, 0.01996117,
        0.01875572, 0.02019839, 0.01971803, 0.01885428, 0.02069831,
        0.02089276, 0.01984696, 0.02074292, 0.02025695, 0.02009115,
        0.01985507, 0.01933334,

In [108]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3E} (std: {1:.3})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [109]:
report(grid_search1.cv_results_,100)

Model with rank: 1
Mean validation score: -7.896E+02 (std: 23.3)
Parameters: {'alpha': 4.0}

Model with rank: 2
Mean validation score: -7.897E+02 (std: 23.4)
Parameters: {'alpha': 3.0}

Model with rank: 3
Mean validation score: -7.897E+02 (std: 23.2)
Parameters: {'alpha': 5.0}

Model with rank: 4
Mean validation score: -7.898E+02 (std: 23.2)
Parameters: {'alpha': 6.0}

Model with rank: 5
Mean validation score: -7.899E+02 (std: 23.2)
Parameters: {'alpha': 7.0}

Model with rank: 6
Mean validation score: -7.899E+02 (std: 23.6)
Parameters: {'alpha': 2.0}

Model with rank: 7
Mean validation score: -7.900E+02 (std: 23.2)
Parameters: {'alpha': 8.0}

Model with rank: 8
Mean validation score: -7.900E+02 (std: 23.3)
Parameters: {'alpha': 9.0}

Model with rank: 9
Mean validation score: -7.902E+02 (std: 23.4)
Parameters: {'alpha': 10.0}

Model with rank: 10
Mean validation score: -7.904E+02 (std: 23.5)
Parameters: {'alpha': 11.0}

Model with rank: 11
Mean validation score: -7.905E+02 (std: 24.1)
P

In [110]:
final_sal_lasso = grid_search1.predict(test_data)

In [111]:
pd.DataFrame(final_sal_lasso).to_csv('project3_LassoReg.csv',index=False)

## COFFECIENTS IN LASSO

In [112]:
lasso_model = grid_search1.best_estimator_

In [113]:
lasso_model.fit(x_train,y_train)

Lasso(alpha=4.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [114]:
list(zip(x_train.columns,lasso_model.coef_))

[('Area_dist_level_Medium', 45.29744354403235),
 ('Area_dist_level_Small', 0.0),
 ('Area_dist_level_Unknown', -0.0),
 ('Area_City_Type_Tier 2', 43.16071015146733),
 ('Area_City_Type_Tier 3', -1.8602669018443037),
 ('Area_Type_DownTown', 247.18067206083003),
 ('Area_Type_Industrial', -21.74495790478251),
 ('Area_Type_MidTownResidential', -1618.527016848268),
 ('SidEffect_Level_mild', -3.885630932146658),
 ('Medicine_Type_Antacids', 0.0),
 ('Medicine_Type_Antibiotics', 4.025183218120747),
 ('Medicine_Type_Antifungal', 0.0),
 ('Medicine_Type_Antimalarial', -0.0),
 ('Medicine_Type_Antipyretics', -72.15282013315415),
 ('Medicine_Type_Antiseptics', -0.0),
 ('Medicine_Type_Antiviral', 0.0),
 ('Medicine_Type_Cardiac', 1.2615337842639895),
 ('Medicine_Type_Hreplacements', -0.0),
 ('Medicine_Type_Mstablizers', 0.0),
 ('Medicine_Type_MuscleRelaxants', 0.0),
 ('Medicine_Type_OralContraceptives', -0.0),
 ('Medicine_Type_Statins', 0.0),
 ('Medicine_Type_Stimulants', 0.0),
 ('Medicine_Type_Tranquiliz

In [115]:
grid_search1.best_score_

-789.6312257023253

In [116]:
grid_search1.best_params_

{'alpha': 4.0}