# Introduction To Pandas
## The Basics Of DataFrames

#### Importing pandas

In [None]:
import pandas as pd

#### Reading data into a DataFrame

In [None]:
# Load the red-wine data set; the file separates fields with semicolons, not commas.
red_wine_quality = pd.read_csv(
    "data/winequality-red.csv",
    sep=';',
)

#### Getting a sense of the data

##### How many rows and columns are in the DataFrame?

In [None]:
# (number of rows, number of columns) -- the same pair that `.shape` reports.
(len(red_wine_quality.index), len(red_wine_quality.columns))

- How many rows and columns are in the DataFrame?
> The DataFrame includes `1599` rows and `12` columns.

In [None]:
# Iterating a DataFrame yields its column labels, one per iteration.
for column_label in red_wine_quality:
    print(column_label)

##### What data type is in each column?

| Column Name          | Data Types |
|----------------------|------------|
| fixed acidity        | float64    |
| volatile acidity     | float64    |
| citric acid          | float64    |
| residual sugar       | float64    |
| chlorides            | float64    |
| free sulfur dioxide  | float64    |
| total sulfur dioxide | float64    |
| density              | float64    |
| pH                   | float64    |
| sulphates            | float64    |
| alcohol              | float64    |
| quality              | int64      |

##### Are all of the variables continuous, or are any categorical?
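> All of the variables are continuous (numeric). Note that `quality` is stored as an integer score, so it can also be treated as an ordinal category.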

##### How many non-null values are in each column?

In [None]:
# `count()` returns the number of non-null values per column, which is what
# the question asks for (`isna().sum()` would report the number of *missing* values).
red_wine_quality.count()

##### What are the min, mean, max, median for all numeric columns?

In [None]:
# Summary statistics for every numeric column; the median is the row labelled "50%".
summary_stats = red_wine_quality.describe()
summary_stats

In [None]:
# Print a concise summary of the DataFrame: each column with its non-null count and dtype.
red_wine_quality.info()

## Practice with Grabbing Data

##### Grab the first 10 rows of the `chlorides` column. 

In [None]:
# Select the column as a one-column DataFrame, then take the first ten rows.
chlorides_only = red_wine_quality[['chlorides']]
chlorides_only[:10]

##### Grab the last 10 rows of the `chlorides` column. 

In [None]:
# Select the column as a one-column DataFrame, then take the final ten rows.
chlorides_only = red_wine_quality[['chlorides']]
chlorides_only[-10:]

##### Grab indices 264-282 of the `chlorides` **and** `density` columns. 

In [None]:
# `.loc` slices by index label and includes BOTH endpoints, so rows 264 through
# 282 are all returned. A plain `[264:282]` slice is positional and stops at 281.
# This relies on the default 0..n-1 index that `read_csv` assigns to this frame.
red_wine_quality.loc[264:282, ['chlorides', 'density']]

##### Grab all rows where the `chlorides` value is less than 0.10.

In [None]:
# Boolean mask marking the rows whose chlorides value is below 0.10.
low_chloride = red_wine_quality['chlorides'] < 0.1
# Keep the matching rows, then show only the chlorides column.
filtered = red_wine_quality.loc[low_chloride]
filtered.loc[:, ['chlorides']]

##### Now grab all the rows where the `chlorides` value is greater than the column's mean (try **not** to use a hard-coded value for the mean, but instead a method).

In [None]:
# Mean of the chlorides column, computed rather than hard-coded.
chlorides = red_wine_quality['chlorides']
chlorides.mean()

In [None]:
def row_value_is_greater_than(df, column_name, value):
    """Return the rows of `df` whose `column_name` entry is strictly above `value`.

    The result is a DataFrame containing only the `column_name` column; the
    original index labels of the matching rows are preserved.
    """
    # Row mask and column selection done in a single `.loc` lookup.
    return df.loc[df[column_name] > value, [column_name]]

In [None]:
def is_bigger_than_mean(df, column_name):
    """Return the rows of `df` whose `column_name` value exceeds that column's mean.

    Delegates the filtering to `row_value_is_greater_than`, so the result
    holds only the `column_name` column.
    """
    column_mean = df[column_name].mean()
    return row_value_is_greater_than(df, column_name, column_mean)

In [None]:
# Rows whose chlorides value is above the column mean.
# (The helper defined above is `is_bigger_than_mean`; `is_bigger_as_mean` does not exist.)
is_bigger_than_mean(red_wine_quality, 'chlorides')

##### Grab all those rows where the `pH` is greater than 3.0 and less than 3.5.

In [None]:
column_name = 'pH'
left_value = 3.0
right_value = 3.5

# "Greater than 3.0 and less than 3.5" excludes both endpoints, hence
# inclusive='neither' (the boolean form of `inclusive` is no longer accepted
# by current pandas). The mask must be applied to the frame; evaluating
# `between` on its own filters nothing.
in_range = red_wine_quality[column_name].between(
    left_value, right_value, inclusive='neither'
)
red_wine_quality[in_range]

In [None]:
column_name = 'pH'
left_value = 3.0
right_value = 3.5

# Build both conditions, then combine them with `&` so that each row must
# satisfy the lower AND the upper bound. Filtering the original frame twice in
# a row would keep only the second condition.
is_greater_than = left_value < red_wine_quality[column_name]
is_less_than = red_wine_quality[column_name] < right_value
filtered = red_wine_quality[is_greater_than & is_less_than]
filtered

##### Further filter the results from the previous step (the `pH` filter) to grab only those rows that have a `residual sugar` less than 2.0.

In [None]:
# Narrow the already-filtered rows down to those with residual sugar below 2.0.
sugar_below_two = filtered['residual sugar'] < 2.0
filtered = filtered.loc[sugar_below_two]
filtered

##### Get the average amount of `chlorides` for each `quality` value.

In [None]:
# Group the rows by quality score, then average chlorides within each group.
by_quality = red_wine_quality.groupby('quality')
by_quality['chlorides'].mean()

##### For observations with a `pH` greater than 3.0 and less than 4.0, find the average alcohol value by `pH`

In [None]:
# Strict bounds: pH greater than 3.0 and less than 4.0.
is_greater_than = 3.0 < red_wine_quality['pH']
is_smaller_than = red_wine_quality['pH'] < 4.0
# Combine both masks with `&`; filtering the original frame twice would keep
# only the second condition.
filtered = red_wine_quality[is_greater_than & is_smaller_than]
# "By pH" means one average per distinct pH value, not a single overall mean.
filtered.groupby('pH')['alcohol'].mean()

##### For observations with an `alcohol` value between 9.25 and 9.5, find the highest amount of `residual sugar`.

In [None]:
def is_in_range(df, column_name, low, high):
    """Return the rows of `df` whose `column_name` value lies in [low, high).

    The lower bound is inclusive and the upper bound is exclusive. All columns
    of the matching rows are returned and their index labels are preserved.
    """
    is_greater_than = low <= df[column_name]
    is_smaller_than = df[column_name] < high
    # Both conditions must hold for a row to be kept; applying the masks one
    # after another to `df` would silently discard the lower bound.
    return df[is_greater_than & is_smaller_than]

In [None]:
# Restrict to the requested alcohol band, then take the largest residual sugar value.
alcohol_band = is_in_range(red_wine_quality, 'alcohol', 9.25, 9.5)
alcohol_band['residual sugar'].max()

##### Create a new column, called `total_acidity`, that is the sum of `fixed acidity` and `volatile acidity`.

In [None]:
# `df` is a second name for the same object, not a copy, so the new column is
# added to `red_wine_quality` as well.
df = red_wine_quality
fixed, volatile = df['fixed acidity'], df['volatile acidity']
df['total_acidity'] = fixed + volatile

In [None]:
# `df` was bound to `red_wine_quality` above, so this displays that frame including `total_acidity`.
df

##### Find the average `total_acidity` for each of the `quality` values

In [None]:
# Group the rows by quality score, then average total_acidity within each group.
by_quality = red_wine_quality.groupby('quality')
by_quality['total_acidity'].mean()

##### Find the top 5 `density` values.

In [None]:
# Order rows from densest to least dense and show the first five density values.
densest_first = red_wine_quality.sort_values('density', ascending=False)
densest_first['density'].head(5)

In [136]:
# Order rows by ascending sulphates and show the ten smallest values.
by_sulphates = red_wine_quality.sort_values('sulphates')
by_sulphates['sulphates'].head(10)

170     0.33
1369    0.37
1287    0.37
1347    0.39
1348    0.39
65      0.39
837     0.39
64      0.39
836     0.39
1237    0.40
Name: sulphates, dtype: float64