# Машинное обучение и анализ данных

## Высшая Школа Цифровой Культуры

## Библиотека $\mathsf{pandas}$

Подключение основных возможностей библиотеки $\mathsf{pandas}$ осуществляется двумя строчками кода: 

In [1]:
from pandas import Series, DataFrame
import pandas as pd

In [3]:
# Wrap a plain Python list in a Series; no index is given, so the default one is used.
numbers = Series(data=[1, -3, 2, 7])
# Print the index/value pairs followed by the dtype.
print(numbers)

0    1
1   -3
2    2
3    7
dtype: int64


In [4]:
# Build the Series from a list, then inspect its two components.
numbers = Series(data=[1, -3, 2, 7])
# First line: the pandas index object; second line: the underlying array of values.
print(numbers.index, numbers.values, sep='\n')

RangeIndex(start=0, stop=4, step=1)
[ 1 -3  2  7]


In [5]:
# Build a Series from a list of values paired with an explicit list of index labels.
numbers = Series(
    data=[1, -3, 2, 7],
    index=['d', 'b', 'a', 'c'],
)
# Print the labelled Series.
print(numbers)

d    1
b   -3
a    2
c    7
dtype: int64


In [12]:
# Index labels do not have to be unique: here 'a' labels two different elements.
numbers = Series(
    data=[1, -3, 2, 7],
    index=['a', 'b', 'a', 'c'],
)

In [15]:
# Selecting a duplicated label returns every matching element;
# .values exposes them as a NumPy array.
numbers['a'].values

array([1, 2])

In [16]:
# A list of labels selects the elements for each label in turn
# ('a' matches two elements, 'c' matches one).
numbers[['a', 'c']].values

array([1, 2, 7])

In [17]:
# Element-wise comparison: produces a boolean Series that keeps the original index.
numbers > 0

a     True
b    False
a     True
c     True
dtype: bool

In [18]:
# Boolean-mask filtering: keep only the elements for which the condition is True.
numbers[numbers > 0]

a    1
a    2
c    7
dtype: int64

In [19]:
# Arithmetic with a scalar is applied to every element; the index is unchanged.
numbers + 2

a    3
b   -1
a    4
c    9
dtype: int64

In [20]:
# A plain dictionary describing one person.
person = {
    'name': 'Степан',
    'gender': 'муж.',
    'age': 20,
}
# Convert the dictionary into a Series: keys become index labels, values become elements.
person_series = Series(data=person)

In [21]:
person_series  # Display the Series built from the dictionary

name      Степан
gender      муж.
age           20
dtype: object

In [22]:
# Dictionary of equally long lists: each key is a column name, each list is that column's data.
cars = {
    'name': ['ford torino', 'peugeot 504', 'fiat 124b'],
    'hp': [140, 87, 76],
}
# Build a DataFrame from the dictionary.
cars_frame = DataFrame(data=cars)

In [23]:
cars_frame  # Display the DataFrame

Unnamed: 0,name,hp
0,ford torino,140
1,peugeot 504,87
2,fiat 124b,76


In [24]:
# Bracket access by column name returns that column as a Series.
cars_frame['name']

0    ford torino
1    peugeot 504
2      fiat 124b
Name: name, dtype: object

In [25]:
# Attribute-style access to the same column; the result matches cars_frame['name'].
cars_frame.name

0    ford torino
1    peugeot 504
2      fiat 124b
Name: name, dtype: object

In [26]:
# Source data: one list per column.
cars = {
    'name': ['ford torino', 'peugeot 504', 'fiat 124b'],
    'hp': [140, 87, 76],
    'weight': [3449, 2672, 2065],
}
# `columns` fixes the column order, `index` supplies custom row labels.
cars_frame = DataFrame(
    data=cars,
    index=['one', 'two', 'three'],
    columns=['name', 'weight', 'hp'],
)

In [27]:
# Display the frame: columns appear in the requested order, rows carry the custom labels.
cars_frame

Unnamed: 0,name,weight,hp
one,ford torino,3449,140
two,peugeot 504,2672,87
three,fiat 124b,2065,76


In [28]:
# Use the car names as row labels and keep only the numeric columns.
cars_frame = DataFrame(
    data=cars,
    index=cars['name'],
    columns=['weight', 'hp'],
)

In [29]:
# Display the frame indexed by car name.
cars_frame

Unnamed: 0,weight,hp
ford torino,3449,140
peugeot 504,2672,87
fiat 124b,2065,76


In [30]:
# .loc selects a row by its label; the row comes back as a Series indexed by column names.
cars_frame.loc['ford torino']

weight    3449
hp         140
Name: ford torino, dtype: int64

In [31]:
# to_frame() turns the row Series into a one-column DataFrame (column named after the row label).
cars_frame.loc['ford torino'].to_frame()

Unnamed: 0,ford torino
weight,3449
hp,140


In [32]:
# Transposing the one-column frame gives a one-row DataFrame with the original columns.
cars_frame.loc['ford torino'].to_frame().T

Unnamed: 0,weight,hp
ford torino,3449,140


In [33]:
# Read a single cell by (row label, column label) in one .loc lookup.
cars_frame.loc['peugeot 504', 'hp']

87

In [34]:
# Overwrite a single cell with one .loc[row, column] call so the write goes into cars_frame itself.
# The chained form `.loc[row][column] = value` assigns into a temporary row Series: pandas warns
# about it, it is not guaranteed to modify cars_frame, and it never does with Copy-on-Write enabled.
cars_frame.loc['peugeot 504', 'hp'] = 140

In [35]:
# Display the frame to inspect the 'hp' value of 'peugeot 504' after the assignment.
cars_frame

Unnamed: 0,weight,hp
ford torino,3449,140
peugeot 504,2672,140
fiat 124b,2065,76


In [36]:
# drop() with a list of row labels returns a new frame without those rows;
# the result is stored separately in new_cars_frame.
new_cars_frame = cars_frame.drop(['fiat 124b'])

In [37]:
# Display the frame with the 'fiat 124b' row removed.
new_cars_frame

Unnamed: 0,weight,hp
ford torino,3449,140
peugeot 504,2672,140


In [38]:
# axis=1 makes drop() interpret the labels as column names, so the 'hp' column is removed.
new_cars_frame = cars_frame.drop(['hp'], axis=1)

In [39]:
# Display the frame with the 'hp' column removed (all rows of cars_frame are still present).
new_cars_frame

Unnamed: 0,weight
ford torino,3449
peugeot 504,2672
fiat 124b,2065


In [42]:
numbers = Series(data=[1, -3, 2, 7])
# sort_values() returns a new Series ordered by value (ascending); each element keeps its label.
sorted_numbers = numbers.sort_values(ascending=True)
# Print the untouched original first, then the sorted copy.
print(numbers, sorted_numbers, sep='\n')

0    1
1   -3
2    2
3    7
dtype: int64
1   -3
0    1
2    2
3    7
dtype: int64


In [43]:
numbers = Series(data=[1, -3, 2, 7])
# ascending=False reverses the order: largest value first.
sorted_numbers = numbers.sort_values(ascending=False)
# Print the untouched original first, then the sorted copy.
print(numbers, sorted_numbers, sep='\n')

0    1
1   -3
2    2
3    7
dtype: int64
3    7
2    2
0    1
1   -3
dtype: int64


In [44]:
# Two cars share hp == 76 so that the tie-breaking in the next example is visible.
cars = {
    'name': ['ford torino', 'peugeot 504', 'fiat 124b'],
    'hp': [140, 76, 76],
    'weight': [3449, 2672, 2065],
}
cars_frame = DataFrame(
    data=cars,
    index=cars['name'],
    columns=['weight', 'hp'],
)
# Sort the rows by the 'hp' column (ascending by default) and display the result.
cars_frame.sort_values(by='hp')

Unnamed: 0,weight,hp
peugeot 504,2672,76
fiat 124b,2065,76
ford torino,3449,140


In [45]:
# Sort by 'hp' ascending; rows with equal 'hp' are ordered by 'weight' descending.
cars_frame.sort_values(['hp', 'weight'], ascending=[True, False])

Unnamed: 0,weight,hp
peugeot 504,2672,76
fiat 124b,2065,76
ford torino,3449,140


In [46]:
# Rebuild the frame with the original horsepower values for the statistics examples.
cars = {
    'name': ['ford torino', 'peugeot 504', 'fiat 124b'],
    'hp': [140, 87, 76],
    'weight': [3449, 2672, 2065],
}
cars_frame = DataFrame(
    data=cars,
    index=cars['name'],
    columns=['weight', 'hp'],
)

In [47]:
# Column-wise minimum: returns a Series with one value per column.
cars_frame.min()

weight    2065
hp          76
dtype: int64

In [48]:
# Min-max normalization per column: (x - min) / (max - min) maps each column onto [0, 1].
# NOTE(review): a column whose max equals its min would divide by zero here.
cars_frame_norm = (cars_frame - cars_frame.min()) / (cars_frame.max() - cars_frame.min())

In [49]:
# Display the normalized frame.
cars_frame_norm

Unnamed: 0,weight,hp
ford torino,1.0,1.0
peugeot 504,0.438584,0.171875
fiat 124b,0.0,0.0


In [50]:
# Summary statistics per column: count, mean, std, min, quartiles and max.
cars_frame.describe()

Unnamed: 0,weight,hp
count,3.0,3.0
mean,2728.666667,101.0
std,693.737943,34.219877
min,2065.0,76.0
25%,2368.5,81.5
50%,2672.0,87.0
75%,3060.5,113.5
max,3449.0,140.0


In [51]:
# The same summary for the normalized data: every column now has min 0 and max 1.
cars_frame_norm.describe()

Unnamed: 0,weight,hp
count,3.0,3.0
mean,0.479528,0.390625
std,0.501256,0.534686
min,0.0,0.0
25%,0.219292,0.085938
50%,0.438584,0.171875
75%,0.719292,0.585938
max,1.0,1.0


### Импорт данных

In [52]:
all_cars = pd.read_csv('auto-mpg.csv') # auto-mpg.csv is located in the same directory as the Jupyter notebook file

In [53]:
# Preview the first rows of the loaded data (head() shows five rows by default).
all_cars.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [54]:
# Summary statistics for the numeric columns; the text column 'car name' is not included.
all_cars.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,104.319095,2970.424623,15.56809,76.01005,1.572864
std,7.815984,1.701004,104.269838,38.409778,846.841774,2.757689,3.697627,0.802055
min,9.0,3.0,68.0,43.0,1613.0,8.0,70.0,1.0
25%,17.5,4.0,104.25,75.25,2223.75,13.825,73.0,1.0
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,262.0,125.0,3608.0,17.175,79.0,2.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0,3.0


In [55]:
# Re-read the file, this time spelling out the parsing options.
all_cars = pd.read_csv(
    'auto-mpg.csv',
    usecols=['car name', 'horsepower', 'cylinders', 'weight'],  # load only these columns
    index_col='car name',  # use the car name as the row label
    nrows=5,               # read only the first five data rows
    delimiter=',',         # field separator
    decimal='.',           # character used as the decimal point
)

In [56]:
# Preview the reduced data set: selected columns only, indexed by car name.
all_cars.head()

car name,cylinders,horsepower,weight
chevrolet chevelle malibu,8,130,3504
buick skylark 320,8,165,3693
plymouth satellite,8,150,3436
amc rebel sst,8,150,3433
ford torino,8,140,3449
