**Полезные методы в Pandas**

In [8]:
import numpy as np
import pandas as pd

In [9]:
# Load the tips dataset from CSV into a DataFrame.
# NOTE(review): the absolute path looks like a mounted Google Drive folder in Colab — confirm the drive is mounted before running.
df = pd.read_csv("/content/drive/MyDrive/Data/tips.csv")

In [10]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251


In [11]:
# Try the idea on one literal: turn the number into a string and slice off its last four characters.
str(3560325168603410)[-4:]

'3410'

In [12]:
def last_four(num):
    """Return the last four characters of the string form of ``num``.

    The argument is converted with ``str`` first, so an integer such as a
    card number can be passed directly. If the string form has fewer than
    four characters, the whole string is returned.
    """
    digits = str(num)
    return digits[-4:]

In [13]:
# Check the helper on a single card number.
last_four(6011812112971322)

'1322'

In [14]:
# Run last_four on every value of "CC Number" and store the results in a new column.
df["last_four"] = df["CC Number"].apply(last_four)

In [15]:
# Confirm that the new last_four column was added.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,last_four
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959,3410
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608,9230
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458,1322
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260,5994
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251,7221


In [16]:
# Mean of the total_bill column.
df["total_bill"].mean()

np.float64(19.78594262295082)

In [17]:
def yelp(price):
    """Map a bill amount to a Yelp-style price tier.

    Returns ``'$'`` for prices below 10, ``'$$'`` for prices from 10 up to
    (but not including) 30, and ``'$$$'`` for everything else.
    """
    if price < 10:
        return '$'
    # Reaching this point means the first comparison failed, so only the
    # upper bound of the middle tier still needs checking.
    if price < 30:
        return '$$'
    return '$$$'

In [18]:
# Assign a price tier to every bill with the yelp helper and keep the result as a new column.
df["yelp"] = df["total_bill"].apply(yelp)

In [19]:
# Inspect the new column.
df["yelp"]

Unnamed: 0,yelp
0,$$
1,$$
2,$$
3,$$
4,$$
...,...
239,$$
240,$$
241,$$
242,$$


In [20]:
def simple(num):
    """Return ``num`` multiplied by two."""
    doubled = num * 2
    return doubled

In [21]:
# The same doubling logic as simple, written as an anonymous (lambda) function.
lambda num: num*2

<function __main__.<lambda>(num)>

In [23]:
# Double every bill with an inline lambda instead of a named function.
df["total_bill"].apply(lambda num: num*2)

Unnamed: 0,total_bill
0,33.98
1,20.68
2,42.02
3,47.36
4,49.18
...,...
239,58.06
240,54.36
241,45.34
242,35.64


In [24]:
def quality(total_bill, tip, threshold=0.25):
    """Classify a tip as generous or ordinary by its share of the bill.

    Args:
        total_bill: Bill amount.
        tip: Tip amount.
        threshold: Tip-to-bill ratio that a tip must strictly exceed to count
            as generous. Defaults to 0.25.

    Returns:
        "Щедрые чаевые" when ``tip / total_bill`` is strictly greater than
        ``threshold``, otherwise "Обычные чаевые".
    """
    if total_bill == 0:
        # Plain Python numbers would raise ZeroDivisionError below. Treat a
        # positive tip on a zero bill as an unbounded share (generous) and
        # anything else as ordinary, which is also what the comparison yields
        # for NumPy floats (inf / nan).
        return "Щедрые чаевые" if tip > 0 else "Обычные чаевые"
    if tip / total_bill > threshold:
        return "Щедрые чаевые"
    return "Обычные чаевые"

In [25]:
# Check the function on a single bill/tip pair.
quality(16.99, 1.01)

'Обычные чаевые'

In [27]:
# Row-wise apply: with axis=1 each row is handed to the lambda, which passes
# its two fields on to quality. The parameter is called `row` so that it does
# not hide the DataFrame named df.
df["quality"] = df[["total_bill", "tip"]].apply(
    lambda row: quality(row["total_bill"], row["tip"]),
    axis=1,
)

In [28]:
# Compute the same column again with np.vectorize, which lets quality be called with whole columns as arguments.
# The assignment overwrites the "quality" column produced by the row-wise apply.
df["quality"] = np.vectorize(quality)(df["total_bill"], df["tip"])

In [29]:
# Check the columns added so far.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,last_four,yelp,quality
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959,3410,$$,Обычные чаевые
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608,9230,$$,Обычные чаевые
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458,1322,$$,Обычные чаевые
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260,5994,$$,Обычные чаевые
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251,7221,$$,Обычные чаевые


In [30]:
# Re-read the CSV so that df is a fresh copy without the columns added above.
df = pd.read_csv("/content/drive/MyDrive/Data/tips.csv")

In [31]:
# Summary statistics of the numeric columns, transposed so that each column becomes a row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_bill,244.0,19.78594,8.902412,3.07,13.3475,17.795,24.1275,50.81
tip,244.0,2.998279,1.383638,1.0,2.0,2.9,3.5625,10.0
size,244.0,2.569672,0.9510998,1.0,2.0,2.0,3.0,6.0
price_per_person,244.0,7.888197,2.914234,2.88,5.8,7.255,9.39,20.27
CC Number,244.0,2563496000000000.0,2369340000000000.0,60406790000.0,30407310000000.0,3525318000000000.0,4553675000000000.0,6596454000000000.0


In [33]:
# Sort the rows by tip, largest first.
df.sort_values("tip", ascending=False)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
170,50.81,10.00,Male,Yes,Sat,Dinner,3,16.94,Gregory Clark,5473850968388236,Sat1954
212,48.33,9.00,Male,No,Sat,Dinner,4,12.08,Alex Williamson,676218815212,Sat4590
23,39.42,7.58,Male,No,Sat,Dinner,4,9.86,Lance Peterson,3542584061609808,Sat239
59,48.27,6.73,Male,No,Sat,Dinner,4,12.07,Brian Ortiz,6596453823950595,Sat8139
141,34.30,6.70,Male,No,Thur,Lunch,6,5.72,Steven Carlson,3526515703718508,Thur1025
...,...,...,...,...,...,...,...,...,...,...,...
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
111,7.25,1.00,Female,No,Sat,Dinner,1,7.25,Terri Jones,3559221007826887,Sat4801
92,5.75,1.00,Female,Yes,Fri,Dinner,2,2.88,Leah Ramirez,3508911676966392,Fri3780
67,3.07,1.00,Female,Yes,Sat,Dinner,1,3.07,Tiffany Brock,4359488526995267,Sat3455


In [34]:
# Sort by tip in ascending order (the default), using size to order rows with equal tips.
df.sort_values(["tip", "size"])

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
67,3.07,1.00,Female,Yes,Sat,Dinner,1,3.07,Tiffany Brock,4359488526995267,Sat3455
111,7.25,1.00,Female,No,Sat,Dinner,1,7.25,Terri Jones,3559221007826887,Sat4801
92,5.75,1.00,Female,Yes,Fri,Dinner,2,2.88,Leah Ramirez,3508911676966392,Fri3780
236,12.60,1.00,Male,Yes,Sat,Dinner,2,6.30,Matthew Myers,3543676378973965,Sat5032
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
...,...,...,...,...,...,...,...,...,...,...,...
141,34.30,6.70,Male,No,Thur,Lunch,6,5.72,Steven Carlson,3526515703718508,Thur1025
59,48.27,6.73,Male,No,Sat,Dinner,4,12.07,Brian Ortiz,6596453823950595,Sat8139
23,39.42,7.58,Male,No,Sat,Dinner,4,9.86,Lance Peterson,3542584061609808,Sat239
212,48.33,9.00,Male,No,Sat,Dinner,4,12.08,Alex Williamson,676218815212,Sat4590


In [35]:
# Largest value in total_bill.
df["total_bill"].max()

50.81

In [36]:
# Index label of the row that holds the largest total_bill.
df["total_bill"].idxmax()

170

In [40]:
# Row with the smallest total_bill. idxmin() returns an index label, so the
# row is looked up by label with .loc; .iloc is positional and only lands on
# the same row while the index is the default 0..n-1 range.
df.loc[df["total_bill"].idxmin()]

Unnamed: 0,67
total_bill,3.07
tip,1.0
sex,Female
smoker,Yes
day,Sat
time,Dinner
size,1
price_per_person,3.07
Payer Name,Tiffany Brock
CC Number,4359488526995267


In [41]:
# Pairwise correlations; numeric_only=True restricts the calculation to the numeric columns.
df.corr(numeric_only=True)

Unnamed: 0,total_bill,tip,size,price_per_person,CC Number
total_bill,1.0,0.675734,0.598315,0.647554,0.104576
tip,0.675734,1.0,0.489299,0.347405,0.110857
size,0.598315,0.489299,1.0,-0.175359,-0.030239
price_per_person,0.647554,0.347405,-0.175359,1.0,0.13524
CC Number,0.104576,0.110857,-0.030239,0.13524,1.0


In [43]:
# Number of rows for each distinct value of sex.
df["sex"].value_counts()

Unnamed: 0_level_0,count
sex,Unnamed: 1_level_1
Male,157
Female,87


In [44]:
# Distinct values of day.
df["day"].unique()

array(['Sun', 'Sat', 'Thur', 'Fri'], dtype=object)

In [45]:
# How many distinct values day has.
df["day"].nunique()

4

In [47]:
# Replace each value from the first list with the value at the same position in the second list.
# The result is only displayed; it is not assigned back to df.
df["sex"].replace(["Female", "Male"], ["F", "M"])

Unnamed: 0,sex
0,F
1,M
2,M
3,M
4,F
...,...
239,M
240,F
241,M
242,M


In [48]:
# Lookup table from the full label to its one-letter code.
my_map = {"Female": "F", "Male": "M"}

In [49]:
# Translate each value of sex through the my_map dictionary; the result is only displayed, not stored.
df["sex"].map(my_map)

Unnamed: 0,sex
0,F
1,M
2,M
3,M
4,F
...,...
239,M
240,F
241,M
242,M


In [52]:
# Boolean Series marking rows that are exact repeats of an earlier row.
df.duplicated()

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
239,False
240,False
241,False
242,False


In [53]:
# Small single-column frame with repeated values, used to demonstrate duplicate handling.
simple_df = pd.DataFrame(
    data=[1, 2, 2, 2],
    index=['a', 'b', 'c', 'd'],
)

In [54]:
# On the toy frame the second and third occurrences of 2 are flagged; the first occurrence is not.
simple_df.duplicated()

Unnamed: 0,0
a,False
b,False
c,True
d,True


In [55]:
# Keep only the first occurrence of each distinct row.
simple_df.drop_duplicates()

Unnamed: 0,0
a,1
b,2


In [59]:
# Rows whose total_bill lies between 10 and 20; inclusive="both" keeps bills equal to either endpoint.
df[df["total_bill"].between(10, 20, inclusive="both")]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608
8,15.04,1.96,Male,No,Sun,Dinner,2,7.52,Joseph Mcdonald,3522866365840377,Sun6820
9,14.78,3.23,Male,No,Sun,Dinner,2,7.39,Jerome Abbott,3532124519049786,Sun3775
10,10.27,1.71,Male,No,Sun,Dinner,2,5.14,William Riley,566287581219,Sun2546
...,...,...,...,...,...,...,...,...,...,...,...
234,15.53,3.00,Male,Yes,Sat,Dinner,2,7.76,Tracy Douglas,4097938155941930,Sat7220
235,10.07,1.25,Male,No,Sat,Dinner,2,5.04,Sean Gonzalez,3534021246117605,Sat4615
236,12.60,1.00,Male,Yes,Sat,Dinner,2,6.30,Matthew Myers,3543676378973965,Sat5032
242,17.82,1.75,Male,No,Sat,Dinner,2,8.91,Dennis Dixon,4375220550950,Sat17


In [61]:
# The two rows with the largest tip.
df.nlargest(2, "tip")

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
170,50.81,10.0,Male,Yes,Sat,Dinner,3,16.94,Gregory Clark,5473850968388236,Sat1954
212,48.33,9.0,Male,No,Sat,Dinner,4,12.08,Alex Williamson,676218815212,Sat4590


In [62]:
# The same selection done by hand: sort by tip in descending order, then take the first two rows by position.
df.sort_values("tip", ascending=False).iloc[0:2]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
170,50.81,10.0,Male,Yes,Sat,Dinner,3,16.94,Gregory Clark,5473850968388236,Sat1954
212,48.33,9.0,Male,No,Sat,Dinner,4,12.08,Alex Williamson,676218815212,Sat4590


In [63]:
# The two rows with the smallest tip.
df.nsmallest(2, "tip")

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
67,3.07,1.0,Female,Yes,Sat,Dinner,1,3.07,Tiffany Brock,4359488526995267,Sat3455
92,5.75,1.0,Female,Yes,Fri,Dinner,2,2.88,Leah Ramirez,3508911676966392,Fri3780


In [66]:
# Draw five random rows. random_state fixes the draw so that re-running the
# notebook shows the same rows.
df.sample(5, random_state=42)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
233,10.77,1.47,Male,No,Sat,Dinner,2,5.38,Paul Novak,6011698897610858,Sat1467
151,13.13,2.0,Male,No,Sun,Dinner,2,6.56,Jason Arnold,3571825125296106,Sun2127
15,21.58,3.92,Male,No,Sun,Dinner,2,10.79,Matthew Reilly,180073029785069,Sun1878
180,34.65,3.68,Male,Yes,Sun,Dinner,4,8.66,James Hebert DDS,676168737648,Sun7544
56,38.01,3.0,Male,Yes,Sat,Dinner,4,9.5,James Christensen DDS,349793629453226,Sat8903


In [69]:
# Draw a random 10% of the rows. random_state fixes the draw so that
# re-running the notebook shows the same rows.
df.sample(frac=0.1, random_state=42)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
125,29.8,4.2,Female,No,Thur,Lunch,6,4.97,Angela Sanchez,503857080488,Thur3948
45,18.29,3.0,Male,No,Sun,Dinner,2,9.14,Richard Fitzgerald,375156610762053,Sun8643
54,25.56,4.34,Male,No,Sun,Dinner,4,6.39,Ronald Owens,6569607991983380,Sun9470
26,13.37,2.0,Male,No,Sat,Dinner,2,6.68,Kyle Avery,6531339539615499,Sat6651
207,38.73,3.0,Male,Yes,Sat,Dinner,4,9.68,Ricky Ramirez,347817964484033,Sat4505
224,13.42,1.58,Male,Yes,Fri,Lunch,2,6.71,Ronald Vaughn DVM,341503466406403,Fri5959
112,38.07,4.0,Male,No,Sun,Dinner,3,12.69,Jeff Lopez,3572865915176463,Sun591
176,17.89,2.0,Male,Yes,Sun,Dinner,2,8.94,Walter Simmons,6011481578696110,Sun5961
106,20.49,4.06,Male,Yes,Sat,Dinner,2,10.24,Karl Mcdaniel,180024452771522,Sat7865
88,24.71,5.85,Male,No,Thur,Lunch,2,12.36,Roger Taylor,4410248629955,Thur9003
