In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [50]:
ts1 = pd.read_csv(r"Data/TS1.csv", encoding='unicode_escape')
ts2 = pd.read_csv(r"Data/TS2.csv", encoding='unicode_escape')
ts8 = pd.read_csv(r"Data/TS8.csv", encoding='unicode_escape', on_bad_lines='skip', low_memory=False)

## Data Cleaning

In [51]:
ts1.head()

Unnamed: 0,Rep,Cust,Customer Name,Group,Part Number,Description,Doc,Line,Date,Time,Qty,Unit,Disc%,Cost,Range
0,2,193750,ROCK AUTO LLC,ACC,ACC31019,LITERIDER 73-96 F-Series,01V4661208,1,Nov 07 22,08:22,1,308.74,,262.33,28
1,2,193750,ROCK AUTO LLC,ACC,ACC31029,LITERIDER 73-96 F-Series,01V4786521,2,Jan 04 23,16:33,1,296.23,,248.63,28
2,2,193750,ROCK AUTO LLC,ACC,ACC31109,LITERIDER 82-11 Ranger &,01V4689424,1,Nov 20 22,15:23,1,283.15,,237.97,28
3,2,193750,ROCK AUTO LLC,ACC,ACC31109,LITERIDER 82-11 Ranger &,01V4771133,1,Dec 29 22,08:28,1,283.15,,236.97,28
4,2,193750,ROCK AUTO LLC,ACC,ACC31129,LITERIDER 01-06 Explorer,01V4683310,1,Nov 16 22,15:27,1,283.15,,243.5,28


In [52]:
ts1.drop(columns=['Rep', 'Cust', 'Customer Name', 'Description', 'Doc', 'Disc%', 'Range'], inplace=True)
ts2.drop(columns=['Rep', 'Cust', 'Customer Name', 'Description', 'Doc', 'Disc%', 'Range'], inplace=True)
ts8.drop(columns=['Rep', 'Cust', 'Customer Name', 'Description', 'Doc', 'Disc%', 'Range'], inplace=True)

In [53]:
df_list = [ts1, ts2, ts8]

for i in df_list:
    print(f'Missing Values: {i.isnull().sum()}')

Missing Values: Group          8
Part Number    0
Line           0
Date           0
Time           0
Qty            0
Unit           0
Cost           0
dtype: int64
Missing Values: Group          1
Part Number    0
Line           0
Date           0
Time           0
Qty            0
Unit           0
Cost           0
dtype: int64
Missing Values: Group           1
Part Number     0
Line            0
Date            0
Time            0
Qty             0
Unit           21
Cost            0
dtype: int64


In [54]:
for i in df_list:
    i.dropna(inplace=True)

In [55]:
df_list = [ts1, ts2, ts8]

for i in df_list:
    print(f'Missing Values: {i.isnull().sum()}')

Missing Values: Group          0
Part Number    0
Line           0
Date           0
Time           0
Qty            0
Unit           0
Cost           0
dtype: int64
Missing Values: Group          0
Part Number    0
Line           0
Date           0
Time           0
Qty            0
Unit           0
Cost           0
dtype: int64
Missing Values: Group          0
Part Number    0
Line           0
Date           0
Time           0
Qty            0
Unit           0
Cost           0
dtype: int64


In [56]:
for i in df_list:
    print(f'Duplicate Values: {i.duplicated().sum()}')

Duplicate Values: 737
Duplicate Values: 639
Duplicate Values: 540


In [57]:
for i in df_list:
    i.drop_duplicates(inplace=True)

In [58]:
for i in df_list:
    print(f'Duplicate Values: {i.duplicated().sum()}')

Duplicate Values: 0
Duplicate Values: 0
Duplicate Values: 0


In [59]:
for i in df_list:
    print(i.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 222302 entries, 0 to 223046
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Group        222302 non-null  object 
 1   Part Number  222302 non-null  object 
 2   Line         222302 non-null  int64  
 3   Date         222302 non-null  object 
 4   Time         222302 non-null  object 
 5   Qty          222302 non-null  int64  
 6   Unit         222302 non-null  float64
 7   Cost         222302 non-null  float64
dtypes: float64(2), int64(2), object(4)
memory usage: 15.3+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 163148 entries, 0 to 163787
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Group        163148 non-null  object 
 1   Part Number  163148 non-null  object 
 2   Line         163148 non-null  int64  
 3   Date         163148 non-null  object 
 4   Time         163148

In [60]:
for i in df_list:
    i['Date'] = pd.to_datetime(i['Date'])

In [61]:
ts1.head()

Unnamed: 0,Group,Part Number,Line,Date,Time,Qty,Unit,Cost
0,ACC,ACC31019,1,2022-11-07,08:22,1,308.74,262.33
1,ACC,ACC31029,2,2023-01-04,16:33,1,296.23,248.63
2,ACC,ACC31109,1,2022-11-20,15:23,1,283.15,237.97
3,ACC,ACC31109,1,2022-12-29,08:28,1,283.15,236.97
4,ACC,ACC31129,1,2022-11-16,15:27,1,283.15,243.5


In [62]:
ts1.set_index(['Group'], inplace=True)

In [63]:
ts1.head()

Unnamed: 0_level_0,Part Number,Line,Date,Time,Qty,Unit,Cost
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ACC,ACC31019,1,2022-11-07,08:22,1,308.74,262.33
ACC,ACC31029,2,2023-01-04,16:33,1,296.23,248.63
ACC,ACC31109,1,2022-11-20,15:23,1,283.15,237.97
ACC,ACC31109,1,2022-12-29,08:28,1,283.15,236.97
ACC,ACC31129,1,2022-11-16,15:27,1,283.15,243.5


In [65]:
skp_ts1 = ts1['Group']
skp_ts1.head()

KeyError: 'Group'