### Imports and loading the sales data

In [23]:
import pandas as pd 
import numpy as np

# Load the weekly sales records from disk.
# NOTE(review): the preview shows an "Unnamed: 0" column, which suggests the
# CSV was written together with its row index — confirm whether it is needed.
sales = pd.read_csv("sales.csv")

# Preview the first five rows
print(sales.head(n=5))

   Unnamed: 0  store type  department        date  weekly_sales  is_holiday  \
0           0      1    A           1  2010-02-05      24924.50       False   
1           1      1    A           1  2010-03-05      21827.90       False   
2           2      1    A           1  2010-04-02      57258.43       False   
3           3      1    A           1  2010-05-07      17413.94       False   
4           4      1    A           1  2010-06-04      17558.09       False   

   temperature_c  fuel_price_usd_per_l  unemployment  
0       5.727778              0.679451         8.106  
1       8.055556              0.693452         8.106  
2      16.816667              0.718284         7.808  
3      22.527778              0.748928         7.808  
4      27.050000              0.714586         7.808  


In [2]:
# Dimensions of the sales table as a (rows, columns) tuple;
# this run reported 10,774 rows and 10 columns.
sales.shape

(10774, 10)

## Dropping duplicate pairs

In [4]:
# One row per distinct (store, type) pair: the first occurrence is kept
store_types = sales.drop_duplicates(
    subset=["store", "type"],
    keep="first",
)
print(store_types.head(n=5))

      Unnamed: 0  store type  department        date  weekly_sales  \
0              0      1    A           1  2010-02-05      24924.50   
901          901      2    A           1  2010-02-05      35034.06   
1798        1798      4    A           1  2010-02-05      38724.42   
2699        2699      6    A           1  2010-02-05      25619.00   
3593        3593     10    B           1  2010-02-05      40212.84   

      is_holiday  temperature_c  fuel_price_usd_per_l  unemployment  
0          False       5.727778              0.679451         8.106  
901        False       4.550000              0.679451         8.324  
1798       False       6.533333              0.686319         8.623  
2699       False       4.683333              0.679451         7.259  
3593       False      12.411111              0.782478         9.765  


In [5]:
# One row per distinct (store, department) pair: the first occurrence is kept
store_depts = sales.drop_duplicates(
    subset=["store", "department"],
    keep="first",
)
print(store_depts.head(n=5))

    Unnamed: 0  store type  department        date  weekly_sales  is_holiday  \
0            0      1    A           1  2010-02-05      24924.50       False   
12          12      1    A           2  2010-02-05      50605.27       False   
24          24      1    A           3  2010-02-05      13740.12       False   
36          36      1    A           4  2010-02-05      39954.04       False   
48          48      1    A           5  2010-02-05      32229.38       False   

    temperature_c  fuel_price_usd_per_l  unemployment  
0        5.727778              0.679451         8.106  
12       5.727778              0.679451         8.106  
24       5.727778              0.679451         8.106  
36       5.727778              0.679451         8.106  
48       5.727778              0.679451         8.106  


In [8]:
# Subset the rows where is_holiday is True and drop duplicate dates.
# The mask is built with `.eq(True)` so that only rows flagged exactly True
# are selected; a `!= False` comparison would also let rows with a missing
# is_holiday value through, which is not what "is a holiday week" means.
is_holiday_week = sales["is_holiday"].eq(True)
holiday_dates = sales[is_holiday_week].drop_duplicates(subset="date")

# Print the date column of holiday_dates (one entry per distinct holiday week)
print(holiday_dates["date"])

498     2010-09-10
691     2011-11-25
2315    2010-02-12
6735    2012-09-07
6810    2010-12-31
6815    2012-02-10
6820    2011-09-09
Name: date, dtype: object


## Counting categorical variables

Counting is a great way to get an overview of the data and to spot curiosities that you might not notice otherwise.

In [12]:
# store_types holds one row per (store, type) pair, so tallying its "type"
# column gives the number of stores of each type
store_type_column = store_types["type"]
store_counts = store_type_column.value_counts(normalize=False)
print(store_counts)

# The same tally expressed as a fraction of all stores
store_props = store_type_column.value_counts(normalize=True)
print(store_props)

A    11
B     1
Name: type, dtype: int64
A    0.916667
B    0.083333
Name: type, dtype: float64


In [14]:
# store_depts holds one row per (store, department) pair, so each count is the
# number of stores that have that department. value_counts sorts by frequency
# (largest first) by default.
department_column = store_depts["department"]
dept_counts_sorted = department_column.value_counts()
print(dept_counts_sorted)

# The same counts as a share of all (store, department) pairs, sorted likewise
dept_props_sorted = department_column.value_counts(normalize=True)
print(dept_props_sorted)

1     12
55    12
72    12
71    12
67    12
      ..
37    10
48     8
50     6
39     4
43     2
Name: department, Length: 80, dtype: int64
1     0.012917
55    0.012917
72    0.012917
71    0.012917
67    0.012917
        ...   
37    0.010764
48    0.008611
50    0.006459
39    0.004306
43    0.002153
Name: department, Length: 80, dtype: float64


### Implementation for Dogs dataset

In [15]:
# Read the dogs table from disk; it is small enough to print in full
dogs = pd.read_csv(
    "dogs.csv",
)
print(dogs)

     name             breed  color  height_cm  weight_kg date_of_birth
0   Bella         Chihuahua  Brown         18          2    2018-02-05
1   Amigo          Labrador  Black         59         35    2016-08-12
2  Trevis       St. Bernard  Brown         77         73    2019-07-24
3   Golin             Husky  White         55         30    2015-06-18
4    Lucy          Labrador  White         51         26    2020-04-29
5     Max  Golden Retriever  Brown         49         21    2014-01-20
6    Otto            Poodle  Brown         42         20    2013-06-27
7    Rexo   German Shepherd  Brown         54         24    2018-05-21


In [22]:
# Inspect the current values of the name column
print(dogs["name"])

0     Bella
1     Amigo
2    Trevis
3     Golin
4      Lucy
5       Max
6      Otto
7      Rexo
Name: name, dtype: object


In [35]:
# Overwrite the names with values that repeat, so that the same name
# appears on several rows
new_names = [
    "Bella", "Amigo", "Golin", "Max",
    "Bella", "Max", "Bella", "Golin",
]
dogs["name"] = np.array(new_names)

# Add one vet-visit date per row, listed in row order
visit_dates = [
    "2022-12-08", "2022-05-16", "2022-06-19", "2022-01-08",
    "2022-01-02", "2022-03-07", "2022-07-14", "2022-10-18",
]
dogs["date_of_visit"] = visit_dates
print(dogs)

    name             breed  color  height_cm  weight_kg date_of_birth  \
0  Bella         Chihuahua  Brown         18          2    2018-02-05   
1  Amigo          Labrador  Black         59         35    2016-08-12   
2  Golin       St. Bernard  Brown         77         73    2019-07-24   
3    Max             Husky  White         55         30    2015-06-18   
4  Bella          Labrador  White         51         26    2020-04-29   
5    Max  Golden Retriever  Brown         49         21    2014-01-20   
6  Bella            Poodle  Brown         42         20    2013-06-27   
7  Golin   German Shepherd  Brown         54         24    2018-05-21   

  date_of_visit  
0    2022-12-08  
1    2022-05-16  
2    2022-06-19  
3    2022-01-08  
4    2022-01-02  
5    2022-03-07  
6    2022-07-14  
7    2022-10-18  


In [32]:
# Keep only the name, measurements and visit date, then list the most recent
# visits first. The dates are ISO-formatted strings (YYYY-MM-DD), so sorting
# them as text also orders them chronologically.
vet_visits = (
    dogs
    .drop(columns=["breed", "color", "date_of_birth"])
    .sort_values(by="date_of_visit", ascending=False)
)

# Show the resulting vet_visits table
print(vet_visits)

    name  height_cm  weight_kg date_of_visit
0  Bella         18          2    2022-12-08
7  Golin         54         24    2022-10-18
6  Bella         42         20    2022-07-14
2  Golin         77         73    2022-06-19
1  Amigo         59         35    2022-05-16
5    Max         49         21    2022-03-07
3    Max         55         30    2022-01-08
4  Bella         51         26    2022-01-02


In [34]:
# Save the prepared vet-visits table to disk, leaving out the row index
vet_visits.to_csv("vet_visits.csv", index=False)

In [39]:
# Number of rows (vet visits) recorded under each name, most frequent first;
# value_counts sorts by frequency by default
visit_names = vet_visits["name"]
dogs_counts = visit_names.value_counts()
print(dogs_counts)

# The same tally expressed as a share of all visits
dogs_proportions = visit_names.value_counts(normalize=True)
print(dogs_proportions)

Bella    3
Golin    2
Max      2
Amigo    1
Name: name, dtype: int64
Bella    0.375
Golin    0.250
Max      0.250
Amigo    0.125
Name: name, dtype: float64


In [41]:
# Keep only the first row encountered for each name
dogs_unique = vet_visits.drop_duplicates(subset="name", keep="first")
print(dogs_unique["name"])

0    Bella
7    Golin
1    Amigo
5      Max
Name: name, dtype: object


The unique dog names in our dataset are Bella, Golin, Amigo, and Max.