## Reduce Dataset based on ingredient frequency
## This will find all ingredients below a certain frequency and remove the columns and all rows that use the ingredient from the dataset
## Requires the one hot encoded dataset

## Load the one hot encoded dataset

In [1]:
import pandas as pd
from numba import jit, int32, boolean, njit
from numba.typed import List

reducedDF = pd.read_feather("ingredientOneHot.fth").astype("boolean")
colNames = reducedDF.columns #save columns names
print(colNames)
print(reducedDF.head())

Index(['ig_389', 'ig_7655', 'ig_6270', 'ig_1527', 'ig_3406', 'ig_2683',
       'ig_4969', 'ig_800', 'ig_5298', 'ig_840',
       ...
       'ig_6145', 'ig_2522', 'ig_7605', 'ig_6195', 'ig_1681', 'ig_750',
       'ig_5474', 'ig_3845', 'ig_7980', 'ig_7069'],
      dtype='object', length=7993)
   ig_389  ig_7655  ig_6270  ig_1527  ig_3406  ig_2683  ig_4969  ig_800  \
0    True     True     True     True     True    False    False   False   
1   False    False     True    False    False     True     True    True   
2   False     True     True    False    False     True    False   False   
3   False    False    False    False    False    False    False   False   
4   False    False    False    False    False    False    False   False   

   ig_5298  ig_840  ...  ig_6145  ig_2522  ig_7605  ig_6195  ig_1681  ig_750  \
0    False   False  ...    False    False    False    False    False   False   
1     True    True  ...    False    False    False    False    False   False   
2    False   False

## Convert to numpy

In [2]:
import numpy as np

df_np = reducedDF.to_numpy().astype(np.bool_)
df_npT = df_np.T
print(df_np.shape)
print(df_np.dtype)


(178265, 7993)
bool


## Using numba to speed this up. It will be faster the second time these functions are called.

In [3]:
#returns all columns with a frequency less then freq
@njit(cache=True)
def findCols(freq, df_npT):
    index = List()
    for x in range(len(df_npT)):
        col = df_npT[x]
        if col.mean() <= freq / len(col):
            print(x)
            index.append(x)
    print("Found: " + str(len(index)))
    return index


In [4]:
#returns a reduced numpy array that has rows removed if they used on of the ingredients with frequency <= freq
@njit(cache=True)
def removeRows(index, df_np):
    test = df_np[np.where((df_np[:,index[0]] == False))]
    for i in index[1:]:
        test = test[np.where((test[:,i] == False))]
        print(i)
    
    print(test.shape)
    return test
    

In [5]:
freq = 100 #change this to adjust how many ingredient to delete. 
index = findCols(freq, df_npT) #find the ingredients with frequency <= freq
data = removeRows(index, df_np) #rows are removed now
data = np.delete(data, index, axis=1) #delete the columns as well

print(index)
print(data)

4
11
24
38
44
50
60
61
67
80
93
94
102
108
143
166
172
179
204
211
222
224
226
231
235
236
240
241
252
254
255
256
261
262
278
280
295
306
310
319
320
325
332
334
338
341
342
345
346
352
353
354
357
365
369
370
378
388
389
390
391
393
396
404
417
427
428
433
441
445
449
451
456
468
476
477
480
488
491
492
493
496
499
503
509
510
511
512
513
515
517
519
522
523
525
529
530
537
548
569
570
572
579
584
587
589
596
598
601
602
607
609
610
616
617
618
619
622
624
626
627
628
629
632
633
635
636
639
643
646
653
655
664
667
670
678
679
685
687
688
692
700
703
705
709
712
722
724
725
728
732
737
738
739
740
742
745
752
756
762
768
769
774
776
777
779
780
784
785
787
788
805
806
807
808
809
811
815
820
821
825
827
828
830
833
834
841
842
843
844
845
847
849
850
852
855
856
857
858
860
861
863
869
871
873
875
883
884
885
887
888
893
896
898
901
902
903
904
906
910
912
914
916
917
922
924
926
929
931
933
935
936
937
938
942
945
948
950
952
953
955
958
963
967
969
970
973
977
978
979
980
981
982
9

3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625


5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646


7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602


2343
2344
2346
2347
2348
2349
2351
2352
2353
2354
2355
2357
2359
2360
2361
2363
2364
2365
2366
2367
2368
2369
2371
2372
2373
2374
2376
2377
2378
2379
2380
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2403
2404
2405
2406
2407
2408
2410
2411
2412
2413
2414
2415
2416
2417
2418
2421
2422
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2462
2463
2465
2466
2467
2468
2469
2470
2471
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2501
2503
2504
2505
2506
2507
2508
2509
2511
2512
2513
2514
2515
2516
2517
2518
2520
2521
2522
2523
2524
2525
2526
2527
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2565
2566
2567


4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255


5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897


7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539


In [6]:
colNames2 = np.delete(np.array(colNames), index) #need to track columns names as well so we know the ingredient IDs

print(colNames2)
print(data.shape) #reduced shape of the dataset

['ig_389' 'ig_7655' 'ig_6270' ... 'ig_2457' 'ig_1253' 'ig_183']
(99112, 1315)


## Write the new dataset

In [7]:
newDF = pd.DataFrame(data)
newDF.set_axis(colNames2, axis=1, inplace=True) #set columns names to ingredient IDs
print(newDF.head())
newDF.to_feather("lessSparse100f.fth")

   ig_389  ig_7655  ig_6270  ig_1527  ig_2683  ig_4969  ig_800  ig_5298  \
0   False    False    False    False    False    False   False    False   
1   False    False     True    False     True     True    True    False   
2   False    False    False    False    False    False   False    False   
3   False     True     True    False    False    False   False    False   
4   False    False     True    False    False    False   False    False   

   ig_840  ig_2499  ...  ig_4821  ig_330  ig_2910  ig_3247  ig_4970  ig_7491  \
0   False    False  ...    False   False    False    False    False    False   
1   False    False  ...    False   False    False    False    False    False   
2   False    False  ...    False   False    False    False    False    False   
3   False    False  ...    False   False    False    False    False    False   
4    True    False  ...    False   False    False    False    False    False   

   ig_4527  ig_2457  ig_1253  ig_183  
0    False    False    False 