

Научимся фильтровать и преобразовывать данные:

- Познакомимся с методом [query](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html)
- Освоим другие приемы по работе с данными

In [1]:
import pandas as pd
import numpy as np

In [2]:
student_performance = pd.read_csv('data/StudentsPerformance.csv')

In [3]:
student_performance.gender

0      female
1      female
2      female
3        male
4        male
        ...  
995    female
996      male
997    female
998    female
999    female
Name: gender, Length: 1000, dtype: object

In [4]:
student_performance.gender == 'female'

0       True
1       True
2       True
3      False
4      False
       ...  
995     True
996    False
997     True
998     True
999     True
Name: gender, Length: 1000, dtype: bool

In [5]:
student_performance.loc[student_performance.gender == 'female']

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
5,female,group B,associate's degree,standard,none,71,83,78
6,female,group B,some college,standard,completed,88,95,92
...,...,...,...,...,...,...,...,...
993,female,group D,bachelor's degree,free/reduced,none,62,72,74
995,female,group E,master's degree,standard,completed,88,99,95
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [6]:
student_performance.loc[student_performance.gender == 'female', ['gender', 'writing score']]

Unnamed: 0,gender,writing score
0,female,74
1,female,88
2,female,93
5,female,78
6,female,92
...,...,...
993,female,74
995,female,95
997,female,65
998,female,77


In [7]:
# Average only the numeric score columns. The frame also holds text columns
# (gender, lunch, ...), and pandas >= 2.0 raises a TypeError when they are
# included in the reduction instead of silently dropping them.
student_performance.mean(numeric_only=True)

math score       66.089
reading score    69.169
writing score    68.054
dtype: float64

In [8]:
mean_writing_score = student_performance['writing score'].mean()

In [9]:
student_performance.loc[student_performance['writing score'] > mean_writing_score]

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
4,male,group C,some college,standard,none,76,78,75
5,female,group B,associate's degree,standard,none,71,83,78
...,...,...,...,...,...,...,...,...
992,female,group D,associate's degree,free/reduced,none,55,76,76
993,female,group D,bachelor's degree,free/reduced,none,62,72,74
995,female,group E,master's degree,standard,completed,88,99,95
998,female,group D,some college,standard,completed,68,78,77


In [10]:
# Boolean mask: female students whose writing score is above 80.
# Each comparison is parenthesised because & binds tighter than == and >.
query_female_wc = (
    (student_performance['gender'] == 'female')
    & (student_performance['writing score'] > 80)
)

In [11]:
student_performance.loc[query_female_wc].head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
6,female,group B,some college,standard,completed,88,95,92
36,female,group D,associate's degree,standard,none,74,81,83
38,female,group D,associate's degree,free/reduced,completed,75,90,88


In [13]:
# Same selection with the mask written inline: female students with a
# writing score above 80; show the first five matching rows.
student_performance.loc[
    (student_performance['gender'] == 'female')
    & (student_performance['writing score'] > 80)
].head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
6,female,group B,some college,standard,completed,88,95,92
36,female,group D,associate's degree,standard,none,74,81,83
38,female,group D,associate's degree,free/reduced,completed,75,90,88


- Нужно указать приоритет операций, взяв условия в скобки [(condition) и (condition)]
- При работе с pandas series для логического И нужно использовать & вместо уже знакомого and 

https://stackoverflow.com/questions/36921951/truth-value-of-a-series-is-ambiguous-use-a-empty-a-bool-a-item-a-any-o

# Вопрос

У какой доли студентов из датасета в колонке lunch указано free/reduced?

In [49]:
student_performance['lunch'].value_counts(normalize=True)

standard        0.645
free/reduced    0.355
Name: lunch, dtype: float64

In [51]:
(student_performance.lunch == 'free/reduced').mean()

0.355

In [54]:
student_performance.loc[student_performance.lunch == 'free/reduced'].size/student_performance.size

0.355

In [55]:
student_performance.loc[student_performance.lunch == 'free/reduced'].shape[0]/student_performance.shape[0]

0.355

# Вопрос

Как различается [среднее](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.mean.html) и [дисперсия](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.var.html) оценок по предметам у групп студентов со стандартным или урезанным ланчем?


- дисперсия оценок студентов со стандартным ланчем меньше
- студенты со стандартным ланчем имеют в среднем более хорошие оценки 

In [56]:
# Mean scores of students with a free/reduced lunch. numeric_only=True keeps
# the text columns out of the reduction (pandas >= 2.0 raises otherwise).
student_performance.loc[student_performance.lunch == 'free/reduced'].mean(numeric_only=True)

math score       58.921127
reading score    64.653521
writing score    63.022535
dtype: float64

In [59]:
# Mean scores of students with a standard lunch. numeric_only=True keeps
# the text columns out of the reduction (pandas >= 2.0 raises otherwise).
student_performance.loc[student_performance.lunch == 'standard'].mean(numeric_only=True)

math score       70.034109
reading score    71.654264
writing score    70.823256
dtype: float64

In [61]:
# Score variance of students with a free/reduced lunch. numeric_only=True
# keeps the text columns out of the reduction (pandas >= 2.0 raises otherwise).
student_performance.loc[student_performance.lunch == 'free/reduced'].var(numeric_only=True)

math score       229.824270
reading score    221.871139
writing score    238.202881
dtype: float64

In [60]:
# Score variance of students with a standard lunch. numeric_only=True
# keeps the text columns out of the reduction (pandas >= 2.0 raises otherwise).
student_performance.loc[student_performance.lunch == 'standard'].var(numeric_only=True)

math score       186.418089
reading score    191.285560
writing score    205.620887
dtype: float64

In [62]:
# Mean and variance of every score, split by lunch type. The score columns
# are selected explicitly because aggregating the remaining text columns
# with 'mean'/'var' fails on pandas >= 2.0.
student_performance.groupby('lunch')[['math score', 'reading score', 'writing score']].agg(['mean', 'var'])

Unnamed: 0_level_0,math score,math score,reading score,reading score,writing score,writing score
Unnamed: 0_level_1,mean,var,mean,var,mean,var
lunch,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
free/reduced,58.921127,229.82427,64.653521,221.871139,63.022535,238.202881
standard,70.034109,186.418089,71.654264,191.28556,70.823256,205.620887


In [64]:
# Per-lunch-type mean of the score columns only; the text columns cannot be
# averaged and make pandas >= 2.0 raise a TypeError.
student_performance.groupby('lunch')[['math score', 'reading score', 'writing score']].mean()

Unnamed: 0_level_0,math score,reading score,writing score
lunch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
free/reduced,58.921127,64.653521,63.022535
standard,70.034109,71.654264,70.823256


In [65]:
# Per-lunch-type variance of the score columns only; the text columns have no
# variance and make pandas >= 2.0 raise a TypeError.
student_performance.groupby('lunch')[['math score', 'reading score', 'writing score']].var()

Unnamed: 0_level_0,math score,reading score,writing score
lunch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
free/reduced,229.82427,221.871139,238.202881
standard,186.418089,191.28556,205.620887


# Продолжим 

In [14]:
# Replace the spaces in the multi-word column labels with underscores so the
# columns can be accessed as attributes and used by name in query().
student_performance = student_performance.rename(
    columns={
        'parental level of education': 'parental_level_of_education',
        'test preparation course': 'test_preparation_course',
        'math score': 'math_score',
        'reading score': 'reading_score',
        'writing score': 'writing_score',
    }
)

или так 
```python
student_performance.columns = student_performance.columns.str.replace(' ','_')
# или
student_performance.columns = [x.replace(" ", "_") for x in student_performance.columns]
```

Заменим пробелы на нижнее подчёркивание, чтобы обращаться к столбцам через точку.

### Запрос через функцию query()

In [16]:
student_performance.query('writing_score > 74')

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
4,male,group C,some college,standard,none,76,78,75
5,female,group B,associate's degree,standard,none,71,83,78
6,female,group B,some college,standard,completed,88,95,92
...,...,...,...,...,...,...,...,...
991,female,group B,some high school,standard,completed,65,82,78
992,female,group D,associate's degree,free/reduced,none,55,76,76
995,female,group E,master's degree,standard,completed,88,99,95
998,female,group D,some college,standard,completed,68,78,77


In [72]:
student_performance.query('gender == "male"')

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
7,male,group B,some college,free/reduced,none,40,43,39
8,male,group D,high school,free/reduced,completed,64,64,67
10,male,group C,associate's degree,standard,none,58,54,52
...,...,...,...,...,...,...,...,...
985,male,group A,high school,standard,none,57,51,54
987,male,group E,some high school,standard,completed,81,75,76
990,male,group E,high school,free/reduced,completed,86,81,75
994,male,group A,high school,standard,none,63,63,62


In [73]:
student_performance.query('gender == "male" & writing_score > 74')

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
4,male,group C,some college,standard,none,76,78,75
16,male,group C,high school,standard,none,88,89,86
24,male,group D,bachelor's degree,free/reduced,completed,74,71,80
34,male,group E,some college,standard,none,97,87,82
35,male,group E,associate's degree,standard,completed,81,81,79
...,...,...,...,...,...,...,...,...
956,male,group C,some college,standard,none,84,87,81
981,male,group D,some high school,standard,none,81,78,78
982,male,group B,some high school,standard,completed,79,85,86
987,male,group E,some high school,standard,completed,81,75,76


### Можем обращаться к переменным в окружении через префикс '@' 

In [17]:
writing_score_num = 90

In [18]:
student_performance.query('writing_score > @writing_score_num')

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
2,female,group B,master's degree,standard,none,90,95,93
6,female,group B,some college,standard,completed,88,95,92
94,female,group B,some college,standard,none,79,86,92
106,female,group D,master's degree,standard,none,87,100,100
110,female,group D,associate's degree,free/reduced,completed,77,89,98
...,...,...,...,...,...,...,...,...
962,female,group E,associate's degree,standard,none,100,100,100
970,female,group D,bachelor's degree,standard,none,89,100,100
979,female,group C,associate's degree,standard,none,91,95,94
983,female,group A,some college,standard,completed,78,87,91


Соотнесите строчку кода и результат её выполнения. Переменная student_stats содержит датафрэйм с данными о студентах. 

Чтобы было удобно, сойдёмся на одинаковых терминах

- parental level of education - уровень образования родителей
- bachelor's degree - бакалаврская степень
- master's degree - магистерская степень
- test preparation course - тест
- наблюдения, студенты, строки - одно и то же в данном случае


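Верно! Обратите внимание: query не применяется напрямую к колонкам, название которых содержит недопустимые символы (например, пробел или слэш). В новых версиях pandas такие названия можно заключать в обратные кавычки, например: df.query('`math score` > 90').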

Ещё пример query, аналогичный **isin()** - 
```python
variants = ['var1', 'var2']

df.query('col1 == @variants')
```

In [19]:
student_stats = pd.read_csv('data/StudentsPerformance.csv')

In [20]:
# отберёт студентов, уровень образования родителей которых бакалавр или магистр
student_stats[student_stats['parental level of education'].isin(["bachelor's degree", "master's degree"])]

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
2,female,group B,master's degree,standard,none,90,95,93
14,female,group A,master's degree,standard,none,50,53,58
18,male,group C,master's degree,free/reduced,completed,46,42,46
24,male,group D,bachelor's degree,free/reduced,completed,74,71,80
...,...,...,...,...,...,...,...,...
957,female,group D,master's degree,standard,none,92,100,100
969,female,group B,bachelor's degree,standard,none,75,84,80
970,female,group D,bachelor's degree,standard,none,89,100,100
993,female,group D,bachelor's degree,free/reduced,none,62,72,74


In [21]:
# наблюдения, оценка за математику которых выше 90 и за чтение меньше либо равна 72
student_stats[(student_stats['math score'] > 90) & (student_stats['reading score'] <=72)]

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score


In [22]:
# студенты, не прошедшие тест
student_stats[student_stats['test preparation course'] != 'completed']

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
5,female,group B,associate's degree,standard,none,71,83,78
...,...,...,...,...,...,...,...,...
992,female,group D,associate's degree,free/reduced,none,55,76,76
993,female,group D,bachelor's degree,free/reduced,none,62,72,74
994,male,group A,high school,standard,none,63,63,62
996,male,group C,high school,free/reduced,none,62,55,55


In [23]:
# студенты со стандартным ланчем
student_stats.query(" lunch == 'standard' ")

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
4,male,group C,some college,standard,none,76,78,75
5,female,group B,associate's degree,standard,none,71,83,78
...,...,...,...,...,...,...,...,...
987,male,group E,some high school,standard,completed,81,75,76
991,female,group B,some high school,standard,completed,65,82,78
994,male,group A,high school,standard,none,63,63,62
995,female,group E,master's degree,standard,completed,88,99,95


In [24]:
# Names of all columns whose label contains the substring 'score'.
score_columns = [name for name in student_performance.columns if 'score' in name]

In [25]:
student_performance[score_columns].head()

Unnamed: 0,math_score,reading_score,writing_score
0,72,72,74
1,69,90,88
2,90,95,93
3,47,57,44
4,76,78,75


In [26]:
student_performance.filter(like='score')

Unnamed: 0,math_score,reading_score,writing_score
0,72,72,74
1,69,90,88
2,90,95,93
3,47,57,44
4,76,78,75
...,...,...,...
995,88,99,95
996,62,55,55
997,59,71,65
998,68,78,77


In [28]:
# Pick a few rows by position and rename them: the names become index labels
student_performance_with_names = student_performance.iloc[[0, 3, 4, 6, 7]]
student_performance_with_names.index = ['Cersei', 'Tywin', 'Gregor', 'Joffrey', 'Ilyn Payne']

# axis=0 filters by row (index) labels; axis=1 filters by column labels
# keep only the rows whose index label contains the letter 'e'
student_performance_with_names.filter(like='e', axis=0)

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
Cersei,female,group B,bachelor's degree,standard,none,72,72,74
Gregor,male,group C,some college,standard,none,76,78,75
Joffrey,female,group B,some college,standard,completed,88,95,92
Ilyn Payne,male,group B,some college,free/reduced,none,40,43,39


# Вопрос

Изучите документацию метода [filter](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.filter.html) и укажите правильные утверждения

Также есть ещё похожий метод [select_dtypes](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.select_dtypes.html), отбирающий по типам колонок.

Очень полезный метод для больших датафрэймов.

Для применения regex, ознакомьтесь с регулярными выражениями в питоне (библиотека re), в regex можно указывать как паттерн в виде строки, так и скомпилированный в re паттерн.

axis можно указывать как 0, 1 или как 'index', 'columns' соответственно

- Аргумент items отбирает лэйблы, переданные в коллекции в качестве значения
- Аргумент axis отвечает за то, какие лэйблы мы рассматриваем - индекс или колонки
- Аргумент like отбирает те лэйблы, где встречается поданное значение
- Можно использовать только один из аргументов items, like, regex
- Аргумент regex отбирает лэйблы, подходящие к переданному паттерну 

# Задание

Важной частью нашего курса будут задачи на программирование. Давайте попробуем решить первую задачу вместе.

В переменной df  сохранен датафрэйм с произвольным числом колонок и строк. Отберите колонки, в которых есть '-' в датафрэйме df. Сохраните их в переменную selected_columns

In [29]:
df = pd.read_csv("data/column_hell.csv")

In [30]:
df.head()

Unnamed: 0,yrh&6,ohku,q,hfxx,umeca-4,apj,g,ijvnixxo,kudbpyw-3,upubjv,...,fi,clsruyy,g&6,qo$6,ap,o,sladymzr,tif,wuikzxt,wzuvc
0,0.288397,0.911359,0.488069,0.074599,0.016124,0.090555,0.56081,0.110975,0.430787,0.548763,...,0.793122,0.070335,0.145115,0.686579,0.071681,0.563068,0.815201,0.081902,0.333237,0.185148
1,0.861276,0.134987,0.701885,0.897921,0.507115,0.740151,0.963179,0.712024,0.671662,0.497596,...,0.477181,0.767931,0.996055,0.948774,0.054327,0.087341,0.072166,0.856081,0.120481,0.622113
2,0.775577,0.832874,0.862665,0.831049,0.003274,0.703112,0.509795,0.573296,0.286411,0.282792,...,0.421542,0.770413,0.749468,0.395022,0.307752,0.535575,0.341393,0.971201,0.714558,0.702453
3,0.85791,0.555818,0.598305,0.473988,0.784139,0.427963,0.725103,0.28661,0.863717,0.545562,...,0.240252,0.010355,0.759912,0.965837,0.110719,0.439488,0.816526,0.111621,0.373365,0.461615
4,0.748652,0.451725,0.252619,0.584361,0.011265,0.051115,0.755444,0.627191,0.206172,0.955043,...,0.846165,0.043458,0.209347,0.028276,0.700446,0.203503,0.181903,0.428842,0.099398,0.175227


In [31]:
selected_columns = df.filter(like='-')

In [32]:
selected_columns

Unnamed: 0,umeca-4,kudbpyw-3,g-3,v-9,fla-6,hwtbyrkp-5
0,0.016124,0.430787,0.202407,0.725508,0.305076,0.979296
1,0.507115,0.671662,0.864249,0.867818,0.358859,0.160520
2,0.003274,0.286411,0.928865,0.719159,0.340235,0.734612
3,0.784139,0.863717,0.175446,0.554744,0.049325,0.621132
4,0.011265,0.206172,0.044151,0.163643,0.048806,0.437836
...,...,...,...,...,...,...
95,0.220817,0.434444,0.601738,0.146333,0.918888,0.216499
96,0.474131,0.176011,0.359793,0.104707,0.257690,0.066934
97,0.091637,0.871606,0.121075,0.907699,0.313184,0.788592
98,0.360106,0.172385,0.015285,0.019571,0.060751,0.403166
