# Wine

### Introduction:

This exercise is a adaptation from the UCI Wine dataset.
The only pupose is to practice deleting data with pandas.

### Step 1. Import the necessary libraries

In [1]:
import pandas as pd
import numpy as np

### Step 2. Import the dataset from this [address](https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data). 

### Step 3. Assign it to a variable called wine

In [3]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
wine = pd.read_csv(url, header=None)

wine.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


### Step 4. Delete the first, fourth, seventh, nineth, eleventh, thirteenth and fourteenth columns

In [4]:
wine = wine.drop(wine.columns[[0,3,6,8,11,12,13]], axis = 1)

wine.head()

Unnamed: 0,1,2,4,5,7,9,10
0,14.23,1.71,15.6,127,3.06,2.29,5.64
1,13.2,1.78,11.2,100,2.76,1.28,4.38
2,13.16,2.36,18.6,101,3.24,2.81,5.68
3,14.37,1.95,16.8,113,3.49,2.18,7.8
4,13.24,2.59,21.0,118,2.69,1.82,4.32


### Step 5. Assign the columns as below:

The attributes are (dontated by Riccardo Leardi, riclea '@' anchem.unige.it):  
1) alcohol  
2) malic_acid  
3) alcalinity_of_ash  
4) magnesium  
5) flavanoids  
6) proanthocyanins  
7) hue 

In [5]:
wine.columns = ['alcohol', 'malic_acid', 'alcalinity_of_ash', 'magnesium', 'flavanoids', 'proanthocyanins', 'hue']
wine.head()

Unnamed: 0,alcohol,malic_acid,alcalinity_of_ash,magnesium,flavanoids,proanthocyanins,hue
0,14.23,1.71,15.6,127,3.06,2.29,5.64
1,13.2,1.78,11.2,100,2.76,1.28,4.38
2,13.16,2.36,18.6,101,3.24,2.81,5.68
3,14.37,1.95,16.8,113,3.49,2.18,7.8
4,13.24,2.59,21.0,118,2.69,1.82,4.32


### Step 6. Set the values of the first 3 rows from alcohol as NaN

In [6]:
wine.iloc[0:3, 0] = np.nan
wine.head()

Unnamed: 0,alcohol,malic_acid,alcalinity_of_ash,magnesium,flavanoids,proanthocyanins,hue
0,,1.71,15.6,127,3.06,2.29,5.64
1,,1.78,11.2,100,2.76,1.28,4.38
2,,2.36,18.6,101,3.24,2.81,5.68
3,14.37,1.95,16.8,113,3.49,2.18,7.8
4,13.24,2.59,21.0,118,2.69,1.82,4.32


### Step 7. Now set the value of the rows 3 and 4 of magnesium as NaN

In [7]:
wine.iloc[2:4, 3] = np.nan
wine.head()

Unnamed: 0,alcohol,malic_acid,alcalinity_of_ash,magnesium,flavanoids,proanthocyanins,hue
0,,1.71,15.6,127.0,3.06,2.29,5.64
1,,1.78,11.2,100.0,2.76,1.28,4.38
2,,2.36,18.6,,3.24,2.81,5.68
3,14.37,1.95,16.8,,3.49,2.18,7.8
4,13.24,2.59,21.0,118.0,2.69,1.82,4.32


### Step 8. Fill the value of NaN with the number 10 in alcohol and 100 in magnesium

In [8]:
wine.alcohol.fillna(10, inplace=True)

wine.magnesium.fillna(100, inplace=True)

wine.head()

Unnamed: 0,alcohol,malic_acid,alcalinity_of_ash,magnesium,flavanoids,proanthocyanins,hue
0,10.0,1.71,15.6,127.0,3.06,2.29,5.64
1,10.0,1.78,11.2,100.0,2.76,1.28,4.38
2,10.0,2.36,18.6,100.0,3.24,2.81,5.68
3,14.37,1.95,16.8,100.0,3.49,2.18,7.8
4,13.24,2.59,21.0,118.0,2.69,1.82,4.32


### Step 9. Count the number of missing values

In [9]:
wine.isnull().sum()

alcohol              0
malic_acid           0
alcalinity_of_ash    0
magnesium            0
flavanoids           0
proanthocyanins      0
hue                  0
dtype: int64

In [10]:
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 7 columns):
alcohol              178 non-null float64
malic_acid           178 non-null float64
alcalinity_of_ash    178 non-null float64
magnesium            178 non-null float64
flavanoids           178 non-null float64
proanthocyanins      178 non-null float64
hue                  178 non-null float64
dtypes: float64(7)
memory usage: 9.8 KB


### Step 10.  Create an array of 10 random numbers up until 10

In [12]:
np.random.randint?

[0;31mDocstring:[0m
randint(low, high=None, size=None, dtype='l')

Return random integers from `low` (inclusive) to `high` (exclusive).

Return random integers from the "discrete uniform" distribution of
the specified dtype in the "half-open" interval [`low`, `high`). If
`high` is None (the default), then results are from [0, `low`).

Parameters
----------
low : int
    Lowest (signed) integer to be drawn from the distribution (unless
    ``high=None``, in which case this parameter is one above the
    *highest* such integer).
high : int, optional
    If provided, one above the largest (signed) integer to be drawn
    from the distribution (see above for behavior if ``high=None``).
size : int or tuple of ints, optional
    Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
    ``m * n * k`` samples are drawn.  Default is None, in which case a
    single value is returned.
dtype : dtype, optional
    Desired dtype of the result. All dtypes are determined by their
    name

In [11]:
random = np.random.randint(10, size = 10)
random

array([6, 5, 8, 5, 4, 9, 2, 1, 6, 1])

### Step 11.  Use random numbers you generated as an index and assign NaN value to each of cell.

In [13]:
wine.alcohol[random] = np.nan
wine.head(10)

Unnamed: 0,alcohol,malic_acid,alcalinity_of_ash,magnesium,flavanoids,proanthocyanins,hue
0,10.0,1.71,15.6,127.0,3.06,2.29,5.64
1,,1.78,11.2,100.0,2.76,1.28,4.38
2,,2.36,18.6,100.0,3.24,2.81,5.68
3,14.37,1.95,16.8,100.0,3.49,2.18,7.8
4,,2.59,21.0,118.0,2.69,1.82,4.32
5,,1.76,15.2,112.0,3.39,1.97,6.75
6,,1.87,14.6,96.0,2.52,1.98,5.25
7,14.06,2.15,17.6,121.0,2.51,1.25,5.05
8,,1.64,14.0,97.0,2.98,1.98,5.2
9,,1.35,16.0,98.0,3.15,1.85,7.22


### Step 12.  How many missing values do we have?

In [14]:
wine.isnull().sum()

alcohol              7
malic_acid           0
alcalinity_of_ash    0
magnesium            0
flavanoids           0
proanthocyanins      0
hue                  0
dtype: int64

In [15]:
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 7 columns):
alcohol              171 non-null float64
malic_acid           178 non-null float64
alcalinity_of_ash    178 non-null float64
magnesium            178 non-null float64
flavanoids           178 non-null float64
proanthocyanins      178 non-null float64
hue                  178 non-null float64
dtypes: float64(7)
memory usage: 9.8 KB


### Step 13. Delete the rows that contain missing values

In [16]:
wine = wine.dropna(axis=0, how='any')
wine.head(20)

Unnamed: 0,alcohol,malic_acid,alcalinity_of_ash,magnesium,flavanoids,proanthocyanins,hue
0,10.0,1.71,15.6,127.0,3.06,2.29,5.64
3,14.37,1.95,16.8,100.0,3.49,2.18,7.8
7,14.06,2.15,17.6,121.0,2.51,1.25,5.05
10,14.1,2.16,18.0,105.0,3.32,2.38,5.75
11,14.12,1.48,16.8,95.0,2.43,1.57,5.0
12,13.75,1.73,16.0,89.0,2.76,1.81,5.6
13,14.75,1.73,11.4,91.0,3.69,2.81,5.4
14,14.38,1.87,12.0,102.0,3.64,2.96,7.5
15,13.63,1.81,17.2,112.0,2.91,1.46,7.3
16,14.3,1.92,20.0,120.0,3.14,1.97,6.2


### Step 14. Print only the non-null values in alcohol

In [18]:
check_not_null = wine.alcohol.notnull()
check_not_null

0      True
3      True
7      True
10     True
11     True
12     True
13     True
14     True
15     True
16     True
17     True
18     True
19     True
20     True
21     True
22     True
23     True
24     True
25     True
26     True
27     True
28     True
29     True
30     True
31     True
32     True
33     True
34     True
35     True
36     True
       ... 
148    True
149    True
150    True
151    True
152    True
153    True
154    True
155    True
156    True
157    True
158    True
159    True
160    True
161    True
162    True
163    True
164    True
165    True
166    True
167    True
168    True
169    True
170    True
171    True
172    True
173    True
174    True
175    True
176    True
177    True
Name: alcohol, Length: 171, dtype: bool

In [19]:
wine.alcohol[check_not_null]

0      10.00
3      14.37
7      14.06
10     14.10
11     14.12
12     13.75
13     14.75
14     14.38
15     13.63
16     14.30
17     13.83
18     14.19
19     13.64
20     14.06
21     12.93
22     13.71
23     12.85
24     13.50
25     13.05
26     13.39
27     13.30
28     13.87
29     14.02
30     13.73
31     13.58
32     13.68
33     13.76
34     13.51
35     13.48
36     13.28
       ...  
148    13.32
149    13.08
150    13.50
151    12.79
152    13.11
153    13.23
154    12.58
155    13.17
156    13.84
157    12.45
158    14.34
159    13.48
160    12.36
161    13.69
162    12.85
163    12.96
164    13.78
165    13.73
166    13.45
167    12.82
168    13.58
169    13.40
170    12.20
171    12.77
172    14.16
173    13.71
174    13.40
175    13.27
176    13.17
177    14.13
Name: alcohol, Length: 171, dtype: float64

### Step 15.  Reset the index, so it starts with 0 again

In [20]:
wine = wine.reset_index(drop = True)
wine.head()

Unnamed: 0,alcohol,malic_acid,alcalinity_of_ash,magnesium,flavanoids,proanthocyanins,hue
0,10.0,1.71,15.6,127.0,3.06,2.29,5.64
1,14.37,1.95,16.8,100.0,3.49,2.18,7.8
2,14.06,2.15,17.6,121.0,2.51,1.25,5.05
3,14.1,2.16,18.0,105.0,3.32,2.38,5.75
4,14.12,1.48,16.8,95.0,2.43,1.57,5.0


### BONUS: Create your own question and answer it.