# Data Science Regression Project for Nutraceuticals Products

In [49]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from pandas.core.reshape.merge import merge

# Data Load: Load DSLD product details into dataframe

In [50]:
df1 = pd.read_csv('DietarySupplementFacts_1.csv')
df1.head(1)

Unnamed: 0,URL,DSLD ID,Product Name,Serving Size,Ingredient,DSLD Ingredient Categories,Amount Per Serving,Amount Per Serving Unit,% Daily Value per Serving,Daily Value Target Group
0,https://dsld.od.nih.gov/label/542,542,B-2 100 mg,1 Tablet(s),Adults and children 4 or more years of age,Riboflavin,vitamin,100,mg,5882


In [51]:
df1['Serving Size'].unique()

array(['1 Tablet(s)', '1 Caplet(s)', '2 Tablet(s)', ..., '0.125 Oz(s)',
       '0.0625 Oz(s)', '1 Level Teaspoon(s)'], dtype=object)

In [52]:
df1['DSLD Ingredient Categories'].nunique()

28193

In [53]:
df1['Amount Per Serving'].nunique()

18

In [54]:
df2 = pd.read_csv('ProductOverview_1.csv')
df2.head(2)

Unnamed: 0,URL,DSLD ID,Product Name,Brand Name,Bar Code,Net Contents,Serving Size,Product Type [LanguaL],Supplement Form [LanguaL],Date Entered into DSLD,Market Status,Suggested Use
0,https://dsld.od.nih.gov/label/542,542,B-2 100 mg,Vitamin World,0 74312 70640 0,100 Easy To Swallow Coated Tablet(s),1 Tablet(s),Vitamin [A1302],Tablet or Pill [E0155],2011-11-25,On Market,DIRECTIONS: For adults; take one (1) to two (2...
1,https://dsld.od.nih.gov/label/543,543,B-6 100 mg,Vitamin World,0 74312 70653 0,250 Tablet(s),1 Tablet(s),Vitamin [A1302],Tablet or Pill [E0155],2011-11-25,Off Market,DIRECTIONS: For adults; take one (1) tablet da...


In [55]:
df2['Net Contents'].unique()

array(['100 Easy To Swallow Coated Tablet(s)', '250 Tablet(s)',
       '500 Coated Caplet(s)', ..., '0.88 lbs; 400 Gram(s)',
       '2.15 lbs; 975 Gram(s)', '4.5 lbs; 2040 Gram(s)'], dtype=object)

In [56]:
df2['Serving Size'].nunique()

3355

In [57]:
df2['Product Type [LanguaL]'].nunique()

11

In [58]:
df2['Product Type [LanguaL]'].unique()

array(['Vitamin [A1302]', 'Other Combinations [A1325]',
       'Non-Nutrient/Non-Botanical [A1309]', 'Botanical [A1306]',
       'Botanical with Nutrients [A1317]', 'Mineral [A1299]',
       'Amino acid/Protein [A1305]',
       'Multi-Vitamin and Mineral (MVM) [A1315]',
       'Fat/Fatty Acid [A1310]', 'Single Vitamin and Mineral [A1316]',
       'Fiber and Other Nutrients [A1326]'], dtype=object)

In [59]:
df2['Supplement Form [LanguaL]'].nunique()

10

In [60]:
df2['Supplement Form [LanguaL]'].unique()

array(['Tablet or Pill [E0155]', 'Other (e.g. tea bag) [E0172]',
       'Capsule [E0159]', 'Liquid [E0165]', 'Powder [E0162]',
       'Softgel Capsule [E0161]', 'Lozenge [E0174]', 'Unknown [E0177]',
       'Gummy or Jelly [E0176]', 'Bar [E0164]'], dtype=object)

In [61]:
df2['Suggested Use'].nunique()

22032

In [62]:
df1.shape

(407151, 10)

In [16]:
df2.shape

(40000, 12)

In [63]:
df40 = df2.merge(df1, on='DSLD ID')

In [64]:
df40.shape

(407151, 21)

In [65]:
df40.isna().sum()

URL_x                              0
DSLD ID                            0
Product Name_x                     0
Brand Name                         0
Bar Code                       76669
Net Contents                    3092
Serving Size_x                   208
Product Type [LanguaL]             0
Supplement Form [LanguaL]          0
Date Entered into DSLD             0
Market Status                      0
Suggested Use                  10260
URL_y                              0
Product Name_y                     0
Serving Size_y                     0
Ingredient                    110800
DSLD Ingredient Categories         0
Amount Per Serving                 0
Amount Per Serving Unit       102573
% Daily Value per Serving      93616
Daily Value Target Group      248445
dtype: int64

In [67]:
df41 = df40[['DSLD Ingredient Categories','Amount Per Serving','Amount Per Serving Unit','% Daily Value per Serving','Daily Value Target Group','Product Type [LanguaL]','Supplement Form [LanguaL]']]

In [68]:
df41

Unnamed: 0,DSLD Ingredient Categories,Amount Per Serving,Amount Per Serving Unit,% Daily Value per Serving,Daily Value Target Group,Product Type [LanguaL],Supplement Form [LanguaL]
0,Riboflavin,vitamin,100,mg,5882,Vitamin [A1302],Tablet or Pill [E0155]
1,Vitamin B6,vitamin,100,mg,5000,Vitamin [A1302],Tablet or Pill [E0155]
2,Vitamin C,vitamin,1000,mg,1667,Other Combinations [A1325],Other (e.g. tea bag) [E0172]
3,Citrus Bioflavonoids,non-nutrient/non-botanical,25,mg,,Other Combinations [A1325],Other (e.g. tea bag) [E0172]
4,Rose Hips,botanical,20,mg,,Other Combinations [A1325],Other (e.g. tea bag) [E0172]
...,...,...,...,...,...,...,...
407146,Dietary Fiber,fiber,,Gram(s),,Other Combinations [A1325],Powder [E0162]
407147,Sugar,sugar,2,Gram(s),,Other Combinations [A1325],Powder [E0162]
407148,Protein,protein,25,Gram(s),50,Other Combinations [A1325],Powder [E0162]
407149,Calcium,mineral,300,mg,30,Other Combinations [A1325],Powder [E0162]


# Data Cleaning: Handle NA values

In [69]:
df41.isnull().sum()

DSLD Ingredient Categories         0
Amount Per Serving                 0
Amount Per Serving Unit       102573
% Daily Value per Serving      93616
Daily Value Target Group      248445
Product Type [LanguaL]             0
Supplement Form [LanguaL]          0
dtype: int64

In [70]:
df41.dropna(how="all")

Unnamed: 0,DSLD Ingredient Categories,Amount Per Serving,Amount Per Serving Unit,% Daily Value per Serving,Daily Value Target Group,Product Type [LanguaL],Supplement Form [LanguaL]
0,Riboflavin,vitamin,100,mg,5882,Vitamin [A1302],Tablet or Pill [E0155]
1,Vitamin B6,vitamin,100,mg,5000,Vitamin [A1302],Tablet or Pill [E0155]
2,Vitamin C,vitamin,1000,mg,1667,Other Combinations [A1325],Other (e.g. tea bag) [E0172]
3,Citrus Bioflavonoids,non-nutrient/non-botanical,25,mg,,Other Combinations [A1325],Other (e.g. tea bag) [E0172]
4,Rose Hips,botanical,20,mg,,Other Combinations [A1325],Other (e.g. tea bag) [E0172]
...,...,...,...,...,...,...,...
407146,Dietary Fiber,fiber,,Gram(s),,Other Combinations [A1325],Powder [E0162]
407147,Sugar,sugar,2,Gram(s),,Other Combinations [A1325],Powder [E0162]
407148,Protein,protein,25,Gram(s),50,Other Combinations [A1325],Powder [E0162]
407149,Calcium,mineral,300,mg,30,Other Combinations [A1325],Powder [E0162]


In [71]:
df41.dtypes

DSLD Ingredient Categories    object
Amount Per Serving            object
Amount Per Serving Unit       object
% Daily Value per Serving     object
Daily Value Target Group      object
Product Type [LanguaL]        object
Supplement Form [LanguaL]     object
dtype: object

In [25]:
df6 = df4.fillna({
    'Amount Per Serving Unit': 0,
    'Daily Value Target Group': 0,
    '% Daily Value per Serving': 'others',
})
df6

In [72]:
df42 = df41.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

In [73]:
df42.shape

(152184, 7)

In [74]:
df42.dtypes


DSLD Ingredient Categories    object
Amount Per Serving            object
Amount Per Serving Unit       object
% Daily Value per Serving     object
Daily Value Target Group      object
Product Type [LanguaL]        object
Supplement Form [LanguaL]     object
dtype: object

In [76]:
df42['Daily Value Target Group'] = pd.to_numeric(df42['Daily Value Target Group'], errors='coerce')

# Feature Engineering

In [77]:
df42.head(10)

Unnamed: 0,DSLD Ingredient Categories,Amount Per Serving,Amount Per Serving Unit,% Daily Value per Serving,Daily Value Target Group,Product Type [LanguaL],Supplement Form [LanguaL]
0,Riboflavin,vitamin,100.0,mg,5882.0,Vitamin [A1302],Tablet or Pill [E0155]
1,Vitamin B6,vitamin,100.0,mg,5000.0,Vitamin [A1302],Tablet or Pill [E0155]
2,Vitamin C,vitamin,1000.0,mg,1667.0,Other Combinations [A1325],Other (e.g. tea bag) [E0172]
5,Niacin,vitamin,250.0,mg,1250.0,Vitamin [A1302],Tablet or Pill [E0155]
7,Thiamine,vitamin,0.12,mg,8.0,Other Combinations [A1325],Tablet or Pill [E0155]
8,Riboflavin,vitamin,0.04,mg,2.0,Other Combinations [A1325],Tablet or Pill [E0155]
9,Niacin,vitamin,0.3,mg,2.0,Other Combinations [A1325],Tablet or Pill [E0155]
11,Thiamine,vitamin,100.0,mg,6667.0,Other Combinations [A1325],Other (e.g. tea bag) [E0172]
12,Riboflavin,vitamin,100.0,mg,5882.0,Other Combinations [A1325],Other (e.g. tea bag) [E0172]
13,Niacin,vitamin,100.0,mg,500.0,Other Combinations [A1325],Other (e.g. tea bag) [E0172]


In [79]:
df42.head(1)


Unnamed: 0,DSLD Ingredient Categories,Amount Per Serving,Amount Per Serving Unit,% Daily Value per Serving,Daily Value Target Group,Product Type [LanguaL],Supplement Form [LanguaL]
0,Riboflavin,vitamin,100,mg,5882.0,Vitamin [A1302],Tablet or Pill [E0155]


In [80]:
df42.dtypes

DSLD Ingredient Categories     object
Amount Per Serving             object
Amount Per Serving Unit        object
% Daily Value per Serving      object
Daily Value Target Group      float64
Product Type [LanguaL]         object
Supplement Form [LanguaL]      object
dtype: object

In [81]:
df42.rename(columns = {'Daily Value Target Group':'DVTG'}, inplace = True)
df42.rename(columns = {'DSLD Ingredient Categories':'DSLDIC'}, inplace = True)

In [82]:
df42.shape

(152184, 7)

In [83]:
def remove_pps_outliers(df):
    df_out = pd.DataFrame()
    for key, subdf in df.groupby('DSLDIC'):
        m = np.mean(subdf.DVTG)
        st = np.std(subdf.DVTG)
        reduced_df = subdf[(subdf.DVTG>(m-st)) & (subdf.DVTG<=(m+st))]
        df_out = pd.concat([df_out,reduced_df],ignore_index=True)
    return df_out

df43 = remove_pps_outliers(df42)
df43.shape

(133911, 7)

In [85]:
df43.head(1)

Unnamed: 0,DSLDIC,Amount Per Serving,Amount Per Serving Unit,% Daily Value per Serving,DVTG,Product Type [LanguaL],Supplement Form [LanguaL]
0,5-MTHF,vitamin,1,mg,250.0,Vitamin [A1302],Capsule [E0159]


In [86]:
df43['Amount Per Serving Unit'] = df43['Amount Per Serving Unit'].str.replace('<', '')
df43['Amount Per Serving Unit'] = df43['Amount Per Serving Unit'].str.replace('.', '')

In [87]:
df43.dtypes

DSLDIC                        object
Amount Per Serving            object
Amount Per Serving Unit       object
% Daily Value per Serving     object
DVTG                         float64
Product Type [LanguaL]        object
Supplement Form [LanguaL]     object
dtype: object

In [88]:
df44 = df43.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

In [89]:
df44.shape

(133911, 7)

In [90]:
df44.dtypes

DSLDIC                        object
Amount Per Serving            object
Amount Per Serving Unit       object
% Daily Value per Serving     object
DVTG                         float64
Product Type [LanguaL]        object
Supplement Form [LanguaL]     object
dtype: object

In [91]:
df44['Amount Per Serving Unit'] = df44['Amount Per Serving Unit'].astype('int64')

In [92]:
df44.dtypes

DSLDIC                        object
Amount Per Serving            object
Amount Per Serving Unit        int64
% Daily Value per Serving     object
DVTG                         float64
Product Type [LanguaL]        object
Supplement Form [LanguaL]     object
dtype: object

In [93]:
df44.rename(columns = {'Amount Per Serving':'APS'}, inplace = True)
df44.rename(columns = {'Amount Per Serving Unit':'APSU'}, inplace = True)
df44.rename(columns = {'% Daily Value per Serving':'%DVPS'}, inplace = True)
df44.rename(columns = {'Product Type [LanguaL]':'PTYPE'}, inplace = True)
df44.rename(columns = {'Supplement Form [LanguaL]':'SFORM'}, inplace = True)

In [94]:
df44.head(2)

Unnamed: 0,DSLDIC,APS,APSU,%DVPS,DVTG,PTYPE,SFORM
0,5-MTHF,vitamin,1,mg,250.0,Vitamin [A1302],Capsule [E0159]
1,Alpha-Linolenic Acid,fatty acid,33,g,206.0,Fat/Fatty Acid [A1310],Softgel Capsule [E0161]


In [95]:
df44.dtypes

DSLDIC     object
APS        object
APSU        int64
%DVPS      object
DVTG      float64
PTYPE      object
SFORM      object
dtype: object

# Dimensionality Reduction and Outlier Removal Using Standard Deviation and Mean

In [96]:
def remove_apsu_outliers(df):
    df_out = pd.DataFrame()
    for key, subdf in df.groupby('DSLDIC'):
        m = np.mean(subdf.DVTG)
        st = np.std(subdf.DVTG)
        reduced_df = subdf[(subdf.APSU>(m-st)) & (subdf.APSU<=(m+st))]
        df_out = pd.concat([df_out,reduced_df],ignore_index=True)
    return df_out

df45 = remove_pps_outliers(df44)
df45.shape

(105137, 7)

In [97]:
df45

Unnamed: 0,DSLDIC,APS,APSU,%DVPS,DVTG,PTYPE,SFORM
0,Alpha-Tocopherol,vitamin,120,IU,400.0,Other Combinations [A1325],Capsule [E0159]
1,Alpha-Tocopherol,vitamin,120,IU,400.0,Other Combinations [A1325],Capsule [E0159]
2,Alpha-Tocopherol,vitamin,120,IU,400.0,Other Combinations [A1325],Capsule [E0159]
3,Alpha-Tocopherol,vitamin,120,IU,400.0,Other Combinations [A1325],Capsule [E0159]
4,Alpha-Tocopherol,vitamin,80,mg,400.0,Other Combinations [A1325],Capsule [E0159]
...,...,...,...,...,...,...,...
105132,natural MK-7,vitamin,90,mcg,112.0,Other Combinations [A1325],Tablet or Pill [E0155]
105133,natural MK-7,vitamin,90,mcg,112.0,Other Combinations [A1325],Tablet or Pill [E0155]
105134,natural MK-7,vitamin,90,mcg,112.0,Other Combinations [A1325],Tablet or Pill [E0155]
105135,natural MK-7,vitamin,90,mcg,112.0,Other Combinations [A1325],Tablet or Pill [E0155]


In [98]:
df45 = df45.drop_duplicates()

In [99]:
df45

Unnamed: 0,DSLDIC,APS,APSU,%DVPS,DVTG,PTYPE,SFORM
0,Alpha-Tocopherol,vitamin,120,IU,400.0,Other Combinations [A1325],Capsule [E0159]
4,Alpha-Tocopherol,vitamin,80,mg,400.0,Other Combinations [A1325],Capsule [E0159]
22,Ascorbic Acid,vitamin,500,mg,833.0,Multi-Vitamin and Mineral (MVM) [A1315],Capsule [E0159]
23,Ascorbic Acid,vitamin,300,mg,500.0,Non-Nutrient/Non-Botanical [A1309],Capsule [E0159]
24,Ascorbic Acid,vitamin,500,mg,833.0,Other Combinations [A1325],Tablet or Pill [E0155]
...,...,...,...,...,...,...,...
105128,Zinc Picolinate,non-nutrient/non-botanical,30,mg,40.0,Other Combinations [A1325],Capsule [E0159]
105129,as Calcium Ascorbate,non-nutrient/non-botanical,500,mg,833.0,Botanical with Nutrients [A1317],Capsule [E0159]
105130,as Calcium Ascorbate,non-nutrient/non-botanical,200,mg,333.0,Botanical with Nutrients [A1317],Capsule [E0159]
105131,as Calcium Ascorbate,non-nutrient/non-botanical,500,mg,833.0,Other Combinations [A1325],Capsule [E0159]


In [100]:
df45.APS.unique()

array(['vitamin', 'mineral', 'non-nutrient/non-botanical', 'other',
       'sugar', 'fat', 'fiber', 'fatty acid', 'blend', 'botanical',
       'protein'], dtype=object)

In [101]:
aps = {
    'vitamin': 75,
    'mineral': 60,
    'non-nutrient/non-botanical': 20,
    'other': 10,
       'sugar': 40, 
    'fat': 30,
    'fiber': 25,
    'fatty acid': 60,
    'blend': 20,
    'botanical': 30,
       'protein': 60
}

In [103]:
df45.APS = [aps[item] for item in df45.APS]

In [104]:
df45.head(2)

Unnamed: 0,DSLDIC,APS,APSU,%DVPS,DVTG,PTYPE,SFORM
0,Alpha-Tocopherol,75,120,IU,400.0,Other Combinations [A1325],Capsule [E0159]
4,Alpha-Tocopherol,75,80,mg,400.0,Other Combinations [A1325],Capsule [E0159]


In [105]:
df45.rename(columns = {'%DVPS':'UNIT'}, inplace = True)

In [106]:
df45.head(1)

Unnamed: 0,DSLDIC,APS,APSU,UNIT,DVTG,PTYPE,SFORM
0,Alpha-Tocopherol,75,120,IU,400.0,Other Combinations [A1325],Capsule [E0159]


In [107]:
df45.rename(columns = {'DSLDIC':'dsldic'}, inplace = True)
df45.rename(columns = {'APS':'aps'}, inplace = True)
df45.rename(columns = {'APSU':'apsu'}, inplace = True)
df45.rename(columns = {'UNIT':'unit'}, inplace = True)
df45.rename(columns = {'DVTG':'dvtg'}, inplace = True)
df45.rename(columns = {'PTYPE':'ptype'}, inplace = True)
df45.rename(columns = {'SFORM':'sform'}, inplace = True)

In [108]:
df45.head(2)

Unnamed: 0,dsldic,aps,apsu,unit,dvtg,ptype,sform
0,Alpha-Tocopherol,75,120,IU,400.0,Other Combinations [A1325],Capsule [E0159]
4,Alpha-Tocopherol,75,80,mg,400.0,Other Combinations [A1325],Capsule [E0159]


In [109]:
df45.unit.unique()

array(['IU', 'mg', 'mcg', 'mgc', 'ug', 'Nordic Berry(ies)', 'None', 'g',
       'mg AT', 'Gram(s)', '{Calories}', 'Kcal', 'ng', 'gm', 'Grain(s)',
       'mC Unit(s)', 'CG', 'NP Capsule(s)', 'IAU', 'mg NE', 'mg E', 'mmg',
       'grams', 'U', 'IU mg', 'UI', 'mcg RE', 'mgs', 'mg {alpha}-TE'],
      dtype=object)

In [110]:
Unit = {
    'IU': 1,
    'mg': 2,
    'mcg': 3,
    'mgc': 4,
    'ug': 5,
    'Nordic Berry(ies)': 6,
    'None': 32,
    'g': 7,
       'mg AT': 8,
    'others': 33,
    'Gram(s)': 9,
    'ng': 10,
    '{Calories}': 11,
    'Calorie(s)': 12,
       'Kcal': 13,
    'gm': 14,
    'Grain(s)': 15,
    'mC Unit(s)': 16,
    'CG': 17,
    'NP Capsule(s)': 18,
       'IAU': 19,
    'mg NE': 20,
    'mg E': 21,
    'mmg': 22,
    'm.c.u.': 23,
    'grams': 24,
    'U': 25,
    'IU mg': 26,
           'UI': 27,
    'mcg RE': 28,
    'mg RE': 29,
    'mgs': 30,
    'mg {alpha}-TE': 31
}

In [111]:
df45.unit = [Unit[item] for item in df45.unit]

In [112]:
df45

Unnamed: 0,dsldic,aps,apsu,unit,dvtg,ptype,sform
0,Alpha-Tocopherol,75,120,1,400.0,Other Combinations [A1325],Capsule [E0159]
4,Alpha-Tocopherol,75,80,2,400.0,Other Combinations [A1325],Capsule [E0159]
22,Ascorbic Acid,75,500,2,833.0,Multi-Vitamin and Mineral (MVM) [A1315],Capsule [E0159]
23,Ascorbic Acid,75,300,2,500.0,Non-Nutrient/Non-Botanical [A1309],Capsule [E0159]
24,Ascorbic Acid,75,500,2,833.0,Other Combinations [A1325],Tablet or Pill [E0155]
...,...,...,...,...,...,...,...
105128,Zinc Picolinate,20,30,2,40.0,Other Combinations [A1325],Capsule [E0159]
105129,as Calcium Ascorbate,20,500,2,833.0,Botanical with Nutrients [A1317],Capsule [E0159]
105130,as Calcium Ascorbate,20,200,2,333.0,Botanical with Nutrients [A1317],Capsule [E0159]
105131,as Calcium Ascorbate,20,500,2,833.0,Other Combinations [A1325],Capsule [E0159]


In [113]:
df45.ptype.unique()

array(['Other Combinations [A1325]',
       'Multi-Vitamin and Mineral (MVM) [A1315]',
       'Non-Nutrient/Non-Botanical [A1309]', 'Botanical [A1306]',
       'Botanical with Nutrients [A1317]', 'Vitamin [A1302]',
       'Amino acid/Protein [A1305]', 'Fiber and Other Nutrients [A1326]',
       'Mineral [A1299]', 'Single Vitamin and Mineral [A1316]',
       'Fat/Fatty Acid [A1310]'], dtype=object)

In [115]:
df45 = df45.drop_duplicates()

In [116]:
df45

Unnamed: 0,dsldic,aps,apsu,unit,dvtg,ptype,sform
0,Alpha-Tocopherol,75,120,1,400.0,Other Combinations [A1325],Capsule [E0159]
4,Alpha-Tocopherol,75,80,2,400.0,Other Combinations [A1325],Capsule [E0159]
22,Ascorbic Acid,75,500,2,833.0,Multi-Vitamin and Mineral (MVM) [A1315],Capsule [E0159]
23,Ascorbic Acid,75,300,2,500.0,Non-Nutrient/Non-Botanical [A1309],Capsule [E0159]
24,Ascorbic Acid,75,500,2,833.0,Other Combinations [A1325],Tablet or Pill [E0155]
...,...,...,...,...,...,...,...
105128,Zinc Picolinate,20,30,2,40.0,Other Combinations [A1325],Capsule [E0159]
105129,as Calcium Ascorbate,20,500,2,833.0,Botanical with Nutrients [A1317],Capsule [E0159]
105130,as Calcium Ascorbate,20,200,2,333.0,Botanical with Nutrients [A1317],Capsule [E0159]
105131,as Calcium Ascorbate,20,500,2,833.0,Other Combinations [A1325],Capsule [E0159]


In [117]:
df45.rename(columns = {'%DVPS':'UNIT'}, inplace = True)
df45.rename(columns = {'DSLDIC':'dsldic'}, inplace = True)
df45.rename(columns = {'APS':'aps'}, inplace = True)
df45.rename(columns = {'APSU':'apsu'}, inplace = True)
df45.rename(columns = {'UNIT':'unit'}, inplace = True)
df45.rename(columns = {'DVTG':'dvtg'}, inplace = True)
df45.rename(columns = {'PTYPE':'ptype'}, inplace = True)
df45.rename(columns = {'SFORM':'sform'}, inplace = True)

In [118]:
df45.head(1)

Unnamed: 0,dsldic,aps,apsu,unit,dvtg,ptype,sform
0,Alpha-Tocopherol,75,120,1,400.0,Other Combinations [A1325],Capsule [E0159]


In [119]:
df45.aps.unique()

array([75, 60, 20, 10, 40, 30, 25], dtype=int64)

In [120]:
Aps = {
    'vitamin': 75,
    'mineral': 60,
    'non-nutrient/non-botanical': 20,
    'other': 10,
       'sugar': 40, 
    'fat': 30,
    'fiber': 25,
    'fatty acid': 60,
    'blend': 20,
    'botanical': 30,
       'protein': 60
}

In [122]:
df46= df45.copy()

df46.head()

Unnamed: 0,dsldic,aps,apsu,unit,dvtg,ptype,sform
0,Alpha-Tocopherol,75,120,1,400.0,Other Combinations [A1325],Capsule [E0159]
4,Alpha-Tocopherol,75,80,2,400.0,Other Combinations [A1325],Capsule [E0159]
22,Ascorbic Acid,75,500,2,833.0,Multi-Vitamin and Mineral (MVM) [A1315],Capsule [E0159]
23,Ascorbic Acid,75,300,2,500.0,Non-Nutrient/Non-Botanical [A1309],Capsule [E0159]
24,Ascorbic Acid,75,500,2,833.0,Other Combinations [A1325],Tablet or Pill [E0155]


In [123]:
df47 = df46[['apsu','aps','dvtg','dsldic','ptype','sform','unit']]

In [124]:
df47.head(1)

Unnamed: 0,apsu,aps,dvtg,dsldic,ptype,sform,unit
0,120,75,400.0,Alpha-Tocopherol,Other Combinations [A1325],Capsule [E0159],1


# Use One Hot Encoding

In [125]:
dummies = pd.get_dummies(df47.dsldic)
dummies.head(3)

Unnamed: 0,Alpha-Tocopherol,Ascorbic Acid,Beta Carotene,Beta-Carotene,Biotin,Boron,CHROMIUM,COPPER,Calcium,Calcium Carbonate,...,Vitamin K1,Vitamin K2,Yodo,Zinc,Zinc Citrate,Zinc Gluconate,Zinc Mono-L-Methionine Sulfate,Zinc Picolinate,as Calcium Ascorbate,natural MK-7
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [126]:
df48 = pd.concat([df47,dummies.drop('Yodo',axis='columns')],axis='columns')
df48.head(2)

Unnamed: 0,apsu,aps,dvtg,dsldic,ptype,sform,unit,Alpha-Tocopherol,Ascorbic Acid,Beta Carotene,...,Vitamin K-2,Vitamin K1,Vitamin K2,Zinc,Zinc Citrate,Zinc Gluconate,Zinc Mono-L-Methionine Sulfate,Zinc Picolinate,as Calcium Ascorbate,natural MK-7
0,120,75,400.0,Alpha-Tocopherol,Other Combinations [A1325],Capsule [E0159],1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,80,75,400.0,Alpha-Tocopherol,Other Combinations [A1325],Capsule [E0159],2,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [127]:
df49 = df48.drop('dsldic', axis='columns')
df49.head(2)

Unnamed: 0,apsu,aps,dvtg,ptype,sform,unit,Alpha-Tocopherol,Ascorbic Acid,Beta Carotene,Beta-Carotene,...,Vitamin K-2,Vitamin K1,Vitamin K2,Zinc,Zinc Citrate,Zinc Gluconate,Zinc Mono-L-Methionine Sulfate,Zinc Picolinate,as Calcium Ascorbate,natural MK-7
0,120,75,400.0,Other Combinations [A1325],Capsule [E0159],1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,80,75,400.0,Other Combinations [A1325],Capsule [E0159],2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [128]:
dummies = pd.get_dummies(df49.ptype)
dummies.head(3)

Unnamed: 0,Amino acid/Protein [A1305],Botanical [A1306],Botanical with Nutrients [A1317],Fat/Fatty Acid [A1310],Fiber and Other Nutrients [A1326],Mineral [A1299],Multi-Vitamin and Mineral (MVM) [A1315],Non-Nutrient/Non-Botanical [A1309],Other Combinations [A1325],Single Vitamin and Mineral [A1316],Vitamin [A1302]
0,0,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,1,0,0
22,0,0,0,0,0,0,1,0,0,0,0


In [129]:
df50 = pd.concat([df49,dummies.drop('Other Combinations [A1325]',axis='columns')],axis='columns')
df50.head(2)

Unnamed: 0,apsu,aps,dvtg,ptype,sform,unit,Alpha-Tocopherol,Ascorbic Acid,Beta Carotene,Beta-Carotene,...,Amino acid/Protein [A1305],Botanical [A1306],Botanical with Nutrients [A1317],Fat/Fatty Acid [A1310],Fiber and Other Nutrients [A1326],Mineral [A1299],Multi-Vitamin and Mineral (MVM) [A1315],Non-Nutrient/Non-Botanical [A1309],Single Vitamin and Mineral [A1316],Vitamin [A1302]
0,120,75,400.0,Other Combinations [A1325],Capsule [E0159],1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,80,75,400.0,Other Combinations [A1325],Capsule [E0159],2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [130]:
df51 = df50.drop('ptype', axis='columns')
df51.head(2)

Unnamed: 0,apsu,aps,dvtg,sform,unit,Alpha-Tocopherol,Ascorbic Acid,Beta Carotene,Beta-Carotene,Biotin,...,Amino acid/Protein [A1305],Botanical [A1306],Botanical with Nutrients [A1317],Fat/Fatty Acid [A1310],Fiber and Other Nutrients [A1326],Mineral [A1299],Multi-Vitamin and Mineral (MVM) [A1315],Non-Nutrient/Non-Botanical [A1309],Single Vitamin and Mineral [A1316],Vitamin [A1302]
0,120,75,400.0,Capsule [E0159],1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,80,75,400.0,Capsule [E0159],2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [131]:
dummies = pd.get_dummies(df51.sform)
dummies.head(3)

Unnamed: 0,Bar [E0164],Capsule [E0159],Gummy or Jelly [E0176],Liquid [E0165],Lozenge [E0174],Other (e.g. tea bag) [E0172],Powder [E0162],Softgel Capsule [E0161],Tablet or Pill [E0155],Unknown [E0177]
0,0,1,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0
22,0,1,0,0,0,0,0,0,0,0


In [132]:
df52 = pd.concat([df51,dummies.drop('Unknown [E0177]',axis='columns')],axis='columns')
df52.head(2)

Unnamed: 0,apsu,aps,dvtg,sform,unit,Alpha-Tocopherol,Ascorbic Acid,Beta Carotene,Beta-Carotene,Biotin,...,Vitamin [A1302],Bar [E0164],Capsule [E0159],Gummy or Jelly [E0176],Liquid [E0165],Lozenge [E0174],Other (e.g. tea bag) [E0172],Powder [E0162],Softgel Capsule [E0161],Tablet or Pill [E0155]
0,120,75,400.0,Capsule [E0159],1,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,80,75,400.0,Capsule [E0159],2,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [133]:
df53 = df52.drop('sform', axis='columns')
df53.head(2)

Unnamed: 0,apsu,aps,dvtg,unit,Alpha-Tocopherol,Ascorbic Acid,Beta Carotene,Beta-Carotene,Biotin,Boron,...,Vitamin [A1302],Bar [E0164],Capsule [E0159],Gummy or Jelly [E0176],Liquid [E0165],Lozenge [E0174],Other (e.g. tea bag) [E0172],Powder [E0162],Softgel Capsule [E0161],Tablet or Pill [E0155]
0,120,75,400.0,1,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,80,75,400.0,2,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [134]:
dummies = pd.get_dummies(df53.unit)
dummies.head(3)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,21,22,24,25,26,27,28,30,31,32
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [136]:
df54 = df53.drop('unit', axis='columns')
df54.head(2)

Unnamed: 0,apsu,aps,dvtg,Alpha-Tocopherol,Ascorbic Acid,Beta Carotene,Beta-Carotene,Biotin,Boron,CHROMIUM,...,Vitamin [A1302],Bar [E0164],Capsule [E0159],Gummy or Jelly [E0176],Liquid [E0165],Lozenge [E0174],Other (e.g. tea bag) [E0172],Powder [E0162],Softgel Capsule [E0161],Tablet or Pill [E0155]
0,120,75,400.0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,80,75,400.0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [137]:
df54.shape

(19052, 159)

In [138]:
X = df54.drop('apsu', axis='columns')
X.head()

Unnamed: 0,aps,dvtg,Alpha-Tocopherol,Ascorbic Acid,Beta Carotene,Beta-Carotene,Biotin,Boron,CHROMIUM,COPPER,...,Vitamin [A1302],Bar [E0164],Capsule [E0159],Gummy or Jelly [E0176],Liquid [E0165],Lozenge [E0174],Other (e.g. tea bag) [E0172],Powder [E0162],Softgel Capsule [E0161],Tablet or Pill [E0155]
0,75,400.0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,75,400.0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
22,75,833.0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
23,75,500.0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
24,75,833.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# Build a Model Now...(approach 1)

In [139]:
y = df54.apsu
y.head()

0     120
4      80
22    500
23    300
24    500
Name: apsu, dtype: int64

In [140]:
len(y)

19052

In [231]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [232]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)

0.34843551481124

In [233]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

cross_val_score(LinearRegression(), X, y, cv=cv)

array([0.44207395, 0.6076029 , 0.23836542, 0.27630059, 0.2742799 ])

In [234]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model_using_gridsearchcv(X,y):
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                'fit_intercept': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ['mse','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model_using_gridsearchcv(X,y)

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.367735,{'fit_intercept': False}
1,lasso,0.358183,"{'alpha': 1, 'selection': 'cyclic'}"
2,decision_tree,0.297967,"{'criterion': 'friedman_mse', 'splitter': 'ran..."


In [235]:
import pickle
pickle.dump(df25,open('Amount_per_serving_unit_model.pkl','wb'))

In [236]:
import json
columns = {
    'data_columns' : [col.lower() for col in X.columns]
}
with open("APSU_columns.json","w") as f:
    f.write(json.dumps(columns))

# Build a Model Now...(approach 2)

In [240]:
df13.head(1)

Unnamed: 0,dsldic,aps,apsu,unit,dvtg,ptype,sform
0,Alpha-Tocopherol,75,120,1,400,Other Combinations [A1325],Capsule [E0159]


In [238]:
df13.ptype.unique()

array(['Other Combinations [A1325]',
       'Multi-Vitamin and Mineral (MVM) [A1315]', 'Botanical [A1306]',
       'Botanical with Nutrients [A1317]', 'Vitamin [A1302]',
       'Amino acid/Protein [A1305]', 'Fiber and Other Nutrients [A1326]',
       'Mineral [A1299]', 'Non-Nutrient/Non-Botanical [A1309]',
       'Single Vitamin and Mineral [A1316]', 'Fat/Fatty Acid [A1310]'],
      dtype=object)

In [239]:
Ptype = {
    'Other Combinations [A1325]': 11,
       'Multi-Vitamin and Mineral (MVM) [A1315]': 1,
    'Botanical [A1306]': 2,
       'Botanical with Nutrients [A1317]': 3,
    'Vitamin [A1302]': 4,
       'Amino acid/Protein [A1305]': 5,
    'Fiber and Other Nutrients [A1326]': 6,
       'Mineral [A1299]': 7,
    'Non-Nutrient/Non-Botanical [A1309]': 8,
       'Single Vitamin and Mineral [A1316]': 9, 
    'Fat/Fatty Acid [A1310]': 10
}

In [241]:
df14.ptype = [Ptype[item] for item in df14.ptype]

In [243]:
df15 = df14.copy()

In [245]:
df15.head(2)

Unnamed: 0,dsldic,aps,apsu,unit,dvtg,ptype,sform
0,Alpha-Tocopherol,75,120,IU,400,11,Capsule [E0159]
4,Alpha-Tocopherol,75,80,mg,400,11,Capsule [E0159]


In [247]:
df15.sform.unique()

array(['Capsule [E0159]', 'Tablet or Pill [E0155]',
       'Softgel Capsule [E0161]', 'Other (e.g. tea bag) [E0172]',
       'Powder [E0162]', 'Liquid [E0165]', 'Unknown [E0177]',
       'Gummy or Jelly [E0176]', 'Bar [E0164]', 'Lozenge [E0174]'],
      dtype=object)

In [248]:
Sform = {
    'Capsule [E0159]': 1,
    'Tablet or Pill [E0155]': 2,
       'Softgel Capsule [E0161]': 3,
    'Other (e.g. tea bag) [E0172]': 4,
       'Powder [E0162]': 5,
    'Liquid [E0165]': 6,
    'Unknown [E0177]': 7,
       'Gummy or Jelly [E0176]': 8,
    'Bar [E0164]': 9,
    'Lozenge [E0174]': 10
}

In [249]:
df15.sform = [Sform[item] for item in df15.sform]

In [251]:
df15.head(2)

Unnamed: 0,dsldic,aps,apsu,unit,dvtg,ptype,sform
0,Alpha-Tocopherol,75,120,IU,400,11,1
4,Alpha-Tocopherol,75,80,mg,400,11,1


In [252]:
df15.unit.unique()

array(['IU', 'mg', 'mcg', 'mgc', 'ug', 'Nordic Berry(ies)', 'None', 'g',
       'mg AT', 'others', 'Gram(s)', 'ng', '{Calories}', 'Calorie(s)',
       'Kcal', 'gm', 'Grain(s)', 'mC Unit(s)', 'CG', 'NP Capsule(s)',
       'IAU', 'mg NE', 'mg E', 'mmg', 'm.c.u.', 'grams', 'U', 'IU mg',
       'UI', 'mcg RE', 'mg RE', 'mgs', 'mg {alpha}-TE'], dtype=object)

In [253]:
Unit = {
    'IU': 1,
    'mg': 2,
    'mcg': 3,
    'mgc': 4,
    'ug': 5,
    'Nordic Berry(ies)': 6,
    'None': 32,
    'g': 7,
       'mg AT': 8,
    'others': 33,
    'Gram(s)': 9,
    'ng': 10,
    '{Calories}': 11,
    'Calorie(s)': 12,
       'Kcal': 13,
    'gm': 14,
    'Grain(s)': 15,
    'mC Unit(s)': 16,
    'CG': 17,
    'NP Capsule(s)': 18,
       'IAU': 19,
    'mg NE': 20,
    'mg E': 21,
    'mmg': 22,
    'm.c.u.': 23,
    'grams': 24,
    'U': 25,
    'IU mg': 26,
           'UI': 27,
    'mcg RE': 28,
    'mg RE': 29,
    'mgs': 30,
    'mg {alpha}-TE': 31
}

In [254]:
df15.unit = [Unit[item] for item in df15.unit]

In [255]:
df15

Unnamed: 0,dsldic,aps,apsu,unit,dvtg,ptype,sform
0,Alpha-Tocopherol,75,120,1,400,11,1
4,Alpha-Tocopherol,75,80,2,400,11,1
22,Ascorbic Acid,75,500,2,833,1,1
23,Ascorbic Acid,75,500,2,833,11,2
24,Beta Carotene,75,1200,3,40,11,3
...,...,...,...,...,...,...,...
119952,Zinc Picolinate,20,30,2,40,11,1
119953,as Calcium Ascorbate,20,500,2,833,3,1
119954,as Calcium Ascorbate,20,200,2,333,3,1
119955,as Calcium Ascorbate,20,500,2,833,11,1


In [256]:
df26 = df15.copy()

In [258]:
df26.head(2)

Unnamed: 0,dsldic,aps,apsu,unit,dvtg,ptype,sform
0,Alpha-Tocopherol,75,120,1,400,11,1
4,Alpha-Tocopherol,75,80,2,400,11,1


In [259]:
dummies = pd.get_dummies(df26.dsldic)
dummies.head(3)

Unnamed: 0,Alpha-Tocopherol,Ascorbic Acid,Beta Carotene,Beta-Carotene,Biotin,Boron,CHROMIUM,COPPER,Calcium,Calcium Carbonate,...,Vitamin K1,Vitamin K2,Yodo,Zinc,Zinc Citrate,Zinc Gluconate,Zinc Mono-L-Methionine Sulfate,Zinc Picolinate,as Calcium Ascorbate,natural MK-7
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [260]:
df27 = pd.concat([df26,dummies.drop('Yodo',axis='columns')],axis='columns')
df27.head(2)

Unnamed: 0,dsldic,aps,apsu,unit,dvtg,ptype,sform,Alpha-Tocopherol,Ascorbic Acid,Beta Carotene,...,Vitamin K-2,Vitamin K1,Vitamin K2,Zinc,Zinc Citrate,Zinc Gluconate,Zinc Mono-L-Methionine Sulfate,Zinc Picolinate,as Calcium Ascorbate,natural MK-7
0,Alpha-Tocopherol,75,120,1,400,11,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Alpha-Tocopherol,75,80,2,400,11,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [261]:
df28 = df27.drop('dsldic', axis='columns')
df28.head(2)

Unnamed: 0,aps,apsu,unit,dvtg,ptype,sform,Alpha-Tocopherol,Ascorbic Acid,Beta Carotene,Beta-Carotene,...,Vitamin K-2,Vitamin K1,Vitamin K2,Zinc,Zinc Citrate,Zinc Gluconate,Zinc Mono-L-Methionine Sulfate,Zinc Picolinate,as Calcium Ascorbate,natural MK-7
0,75,120,1,400,11,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,75,80,2,400,11,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [270]:
df28.to_csv('Products1.csv', index=False)

In [262]:
df28.shape

(22442, 144)

In [263]:
X = df28.drop('apsu', axis='columns')
X.head()

Unnamed: 0,aps,unit,dvtg,ptype,sform,Alpha-Tocopherol,Ascorbic Acid,Beta Carotene,Beta-Carotene,Biotin,...,Vitamin K-2,Vitamin K1,Vitamin K2,Zinc,Zinc Citrate,Zinc Gluconate,Zinc Mono-L-Methionine Sulfate,Zinc Picolinate,as Calcium Ascorbate,natural MK-7
0,75,1,400,11,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,75,2,400,11,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22,75,2,833,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23,75,2,833,11,2,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24,75,3,40,11,3,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [264]:
y = df28.apsu
y.head()

0      120
4       80
22     500
23     500
24    1200
Name: apsu, dtype: int64

In [265]:
len(y)

22442

In [266]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [267]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)b

0.3454756524700908

In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from pandas.core.reshape.merge import merge

In [3]:
df33 = pd.read_csv('Products1.csv')
df33.head(2)

Unnamed: 0,aps,apsu,unit,dvtg,ptype,sform,Alpha-Tocopherol,Ascorbic Acid,Beta Carotene,Beta-Carotene,...,Vitamin K-2,Vitamin K1,Vitamin K2,Zinc,Zinc Citrate,Zinc Gluconate,Zinc Mono-L-Methionine Sulfate,Zinc Picolinate,as Calcium Ascorbate,natural MK-7
0,75,120,1,400,11,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,75,80,2,400,11,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
df33.shape

(22442, 144)

In [6]:
X = df33.drop('dvtg', axis='columns')
X.head()

Unnamed: 0,aps,apsu,unit,ptype,sform,Alpha-Tocopherol,Ascorbic Acid,Beta Carotene,Beta-Carotene,Biotin,...,Vitamin K-2,Vitamin K1,Vitamin K2,Zinc,Zinc Citrate,Zinc Gluconate,Zinc Mono-L-Methionine Sulfate,Zinc Picolinate,as Calcium Ascorbate,natural MK-7
0,75,120,1,11,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,75,80,2,11,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,75,500,2,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,75,500,2,11,2,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,75,1200,3,11,3,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
y = df33.dvtg
y.head()

0    400
1    400
2    833
3    833
4     40
Name: dvtg, dtype: int64

In [8]:
len(y)

22442

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,random_state=10)

In [19]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)

0.533558500242254

# Use K Fold cross validation to measure accuracy of our LinearRegression model

In [20]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

cross_val_score(LinearRegression(), X, y, cv=cv)

array([0.27262194, 0.47399182, 0.24027407, 0.31542192, 0.35942879])

# Find best model using GridSearchCV

In [21]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression

def find_best_model_using_gridsearchcv(X,y):
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                'fit_intercept': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ['mse','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model_using_gridsearchcv(X,y)

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.332348,{'fit_intercept': True}
1,lasso,0.312183,"{'alpha': 1, 'selection': 'cyclic'}"
2,decision_tree,0.699426,"{'criterion': 'friedman_mse', 'splitter': 'best'}"


# Export the tested model to a pickle file

In [40]:
import pickle
pickle.dump(df33,open('Amount_per_serving_unit_regression _model.pickel','wb'))

# Export location and column information to a file that will be useful later on in our prediction application¶

In [41]:
import json
columns = {
    'data_columns' : [col.lower() for col in df33.columns]
}
with open("Neutraciticals_Product_columns.json","w") as f:
    f.write(json.dumps(columns))