# Basic statistics, graphs and reports
# Taking a random sample
import pandas as pd

# List the names (functions, classes, ...) available in the pandas module.
dir(pd)

#################### Sampling in Python #############################
# Load the Online Retail data set.
# NOTE(review): the path has a space after every "\\" separator; this looks
# like a copy/paste artefact - confirm it matches the real location on disk.
Online_Retail = pd.read_csv("E:\\ Larning\\ hadoop\\ Data Science\\ 001_Python\\ Class Files Python\\ Class Files Python\\ 1.Python Programming\\ 3.Basic Statistics and Reporting in Python\\ datasets\\ Online Retail Sales Data\\ Online Retail.csv", encoding="ISO-8859-1")
Online_Retail.shape

# Draw 1000 rows at random. No seed is passed, so the rows differ between runs.
sample_data = Online_Retail.sample(n=1000)
sample_data.shape
print(sample_data.head())

# Passing a fixed random_state regenerates the same sample on every run.
sample_data1 = Online_Retail.sample(n=1000, random_state=12)
sample_data1.shape
print(sample_data1.head())
23+
##################### LAB: Sampling in Python #############################

# Import "Census Income Data/Income_data.csv"
Income = pd.read_csv("E:\\ Larning\\ hadoop\\ Data Science\\ 001_Python\\ Class Files Python\\ Class Files Python\\ 1.Python Programming\\ 3.Basic Statistics and Reporting in Python\\ datasets\\ Census Income Data\\ Income_data.csv")
Income.shape
Income.head()
Income.tail(3)

# Random sample of 5000 rows
Sample_income = Income.sample(n=5000)
Sample_income.shape
34+
##################### Descriptive statistics #####################
# Import "Census Income Data/Income_data.csv"
Income = pd.read_csv("E:\\ Larning\\ hadoop\\ Data Science\\ 001_Python\\ Class Files Python\\ Class Files Python\\ 1.Python Programming\\ 3.Basic Statistics and Reporting in Python\\ datasets\\ Census Income Data\\ Income_data.csv")

# Column names of the data set
Income.columns.values

# Mean of the "capital-gain" column
gain_mean = Income["capital-gain"].mean()
gain_mean

# Median of the "capital-gain" column
gain_median = Income["capital-gain"].median()
gain_median
47+
##################### LAB: Mean and median in Python #####################

# NOTE(review): the folder is spelled "Online_Retail_Sales_Data" here but
# "Online Retail Sales Data" where this file is first loaded - one of the two
# is probably wrong; confirm against the directory on disk.
Online_Retail = pd.read_csv("E:\\ Larning\\ hadoop\\ Data Science\\ 001_Python\\ Class Files Python\\ Class Files Python\\ 1.Python Programming\\ 3.Basic Statistics and Reporting in Python\\ datasets\\ Online_Retail_Sales_Data\\ Online Retail.csv", encoding="ISO-8859-1")
Online_Retail.shape
Online_Retail.columns.values

# Mean and median of 'UnitPrice' in the Online Retail data
up_mean = Online_Retail['UnitPrice'].mean()
up_mean

up_median = Online_Retail['UnitPrice'].median()
up_median

# Mean and median of 'Quantity' in the Online Retail data
Quantity_mean = Online_Retail['Quantity'].mean()
Quantity_mean

Quantity_median = Online_Retail['Quantity'].median()
Quantity_median
67+
##################### Dispersion measures #####################

##################### Variance and standard deviation #####################
# Split the census rows by native country.
# The compared literal starts with a space; presumably the raw CSV values
# carry that leading space - verify against the data.
usa_income = Income[Income["native-country"] == ' United-States']
usa_income.shape

other_income = Income[Income["native-country"] != ' United-States']
other_income.shape

# Variance and standard deviation of "education-num" for the USA rows
var_usa = usa_income["education-num"].var()
var_usa

std_usa = usa_income["education-num"].std()
std_usa

# The same two measures for every other country
var_other = other_income["education-num"].var()
var_other

std_other = other_income["education-num"].std()
std_other
89+
##################### LAB: Variance and standard deviation #####################
# Variance and standard deviation of UnitPrice
var_UnitPrice = Online_Retail['UnitPrice'].var()
var_UnitPrice

std_UnitPrice = Online_Retail['UnitPrice'].std()
std_UnitPrice

# Variance and standard deviation of Quantity.
# These get their own names so they no longer overwrite the UnitPrice
# results stored in var_UnitPrice / std_UnitPrice above.
var_Quantity = Online_Retail['Quantity'].var()
var_Quantity

std_Quantity = Online_Retail['Quantity'].std()
std_Quantity
104+
###################### Percentiles & quartiles #####################

# count, mean, std, min, quartiles and max of capital-gain in one call
Income["capital-gain"].describe()

# Deciles (0%, 10%, ..., 100%) via .quantile()
Income['capital-gain'].quantile([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
Income['capital-loss'].quantile([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
# For hours-per-week also look at the 95th and 98th percentiles
Income['hours-per-week'].quantile([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.98, 1])
113+
###################### LAB: Percentiles & quartiles in Python ######################
bank = pd.read_csv("E:\\ Larning\\ hadoop\\ Data Science\\ 001_Python\\ Class Files Python\\ Class Files Python\\ 1.Python Programming\\ 3.Basic Statistics and Reporting in Python\\ datasets\\ Bank Tele Marketing\\ bank_market.csv", encoding="ISO-8859-1")
bank.shape

# Summary of the balance variable, obtained with .describe()
summary_bala = bank["balance"].describe()
summary_bala

# Deciles of balance, to see how the values are distributed
bank['balance'].quantile([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])

# Summary of the age variable
summary_age = bank['age'].describe()
summary_age

# Deciles of age, to see how the values are distributed
bank['age'].quantile([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
132+
###################### LAB: Box plots and outlier detection ######################
import matplotlib.pyplot as plt

# Question: do you suspect any outliers in balance?
bank = pd.read_csv("E:\\ Larning\\ hadoop\\ Data Science\\ 001_Python\\ Class Files Python\\ Class Files Python\\ 1.Python Programming\\ 3.Basic Statistics and Reporting in Python\\ datasets\\ Bank Tele Marketing\\ bank_market.csv", encoding="ISO-8859-1")
bank.shape

# Basic box plot of balance with matplotlib's plt.boxplot()
plt.boxplot(bank["balance"])

# Percentiles of balance (deciles plus the 95th) to inspect the distribution
bank['balance'].quantile([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 1])
# Author's conclusion: outliers are present in the balance variable.

# Question: do you suspect any outliers in age?
# Look for outliers in age with a box plot as well.
plt.boxplot(bank["age"])

# Percentiles of age (deciles plus the 95th) to inspect the distribution
bank['age'].quantile([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 1])
# Author's conclusion: no outliers are present in the age variable.
156+
157+
###################### Creating graphs ################################

## Scatter plot
import matplotlib.pyplot as plt

cars = pd.read_csv("E:\\ Larning\\ hadoop\\ Data Science\\ 001_Python\\ Class Files Python\\ Class Files Python\\ 1.Python Programming\\ 3.Basic Statistics and Reporting in Python\\ datasets\\ Cars Data\\ Cars.csv", encoding="ISO-8859-1")
cars.shape
cars.columns.values

# Summaries of the two variables that are plotted below
cars['Horsepower'].describe()
cars['MPG_City'].describe()

# Horsepower on the x axis against city mileage on the y axis
plt.scatter(cars["Horsepower"], cars["MPG_City"])
171+
172+
###################### LAB: Creating graphs ################################

import matplotlib.pyplot as plt
import numpy as np

# Sports data
sports_data = pd.read_csv("E:\\ Larning\\ hadoop\\ Data Science\\ 001_Python\\ Class Files Python\\ Class Files Python\\ 1.Python Programming\\ 3.Basic Statistics and Reporting in Python\\ datasets\\ Sporting_goods_sales\\ Sporting_goods_sales.csv")
sports_data.head(10)

# Scatter plot of Average_Income against Sales: is there a relation between
# the two variables?
plt.scatter(sports_data["Average_Income"], sports_data["Sales"])

# Correlation matrix of the same pair of variables
np.corrcoef(sports_data["Average_Income"], sports_data["Sales"])

# Scatter plot of Under35_Population_pect against Sales (drawn in red), again
# followed by the correlation matrix of the pair.
plt.scatter(sports_data["Under35_Population_pect"], sports_data["Sales"], color="red")
np.corrcoef(sports_data["Under35_Population_pect"], sports_data["Sales"])
191+
###################### Bar chart ######################
# Bar charts are used to summarise categorical variables.

import pandas as pd
import matplotlib.pyplot as plt

cars = pd.read_csv("E:\\ Larning\\ hadoop\\ Data Science\\ 001_Python\\ Class Files Python\\ Class Files Python\\ 1.Python Programming\\ 3.Basic Statistics and Reporting in Python\\ datasets\\ Cars Data\\ Cars.csv", encoding="ISO-8859-1")
cars.shape
cars.columns.values

# Frequency table of Cylinders: the index holds the distinct values and
# .values holds how often each one occurs.
freq = cars["Cylinders"].value_counts()
freq.values
freq.index

# One bar per distinct cylinder count, with its frequency as the height
plt.bar(freq.index, freq.values)
###################### LAB: Bar chart ######################

import matplotlib.pyplot as plt

# Frequency table of Avg_family_size in the sports data
freq = sports_data["Avg_family_size"].value_counts()
freq.values
freq.index

# The same bar chart three times: default settings, bars centred on their
# x positions, and centred bars with the category values as tick labels.
plt.bar(freq.index, freq.values)
plt.bar(freq.index, freq.values, align="center")
plt.bar(freq.index, freq.values, align="center", tick_label=freq.index)
217+
218+
###################### Trend chart ######################

import matplotlib.pyplot as plt

AirPassengers = pd.read_csv("E:\\ Larning\\ hadoop\\ Data Science\\ 001_Python\\ Class Files Python\\ Class Files Python\\ 1.Python Programming\\ 3.Basic Statistics and Reporting in Python\\ datasets\\ Air Travel Data\\ Air_travel.csv", encoding="ISO-8859-1")
AirPassengers.head()
AirPassengers.dtypes
AirPassengers.columns.values

# Line plot of the AIR column against the row index
plt.plot(AirPassengers["AIR"])

# X-axis label: parse DATE into datetimes so the x axis shows real dates.
# The DATE column must already be formatted as DD-MM-YYYY in the CSV.
AirPassengers['new_time'] = pd.to_datetime(AirPassengers['DATE'], format='%d-%m-%Y')
plt.plot(AirPassengers["new_time"], AirPassengers["AIR"])

# Passing a single series plots its values in order, like a time series.
plt.plot(sports_data["Avg_family_size"])
238+
239+
################################
## User-defined functions

def mydistance(x1=1, y1=1, x2=1, y2=1):
    """Print and return the Euclidean distance between two points.

    Parameters:
        x1, y1: coordinates of the first point (both default to 1).
        x2, y2: coordinates of the second point (both default to 1).

    Returns:
        The distance sqrt((x1 - x2)**2 + (y1 - y2)**2) as a float. The value
        is also printed, so existing calls that only rely on the printed
        output keep working.
    """
    import math

    dist = math.sqrt(pow((x1 - x2), 2) + pow((y1 - y2), 2))
    print(dist)
    # Hand the result back so callers can use it, not just read it on screen.
    return dist


mydistance(x1=0, y1=0, x2=2, y2=2)
mydistance(x1=1, y1=0, x2=0, y2=1)
mydistance(x1=4, y1=6, x2=1, y2=2)
mydistance(4, 6, 1, 2)
253+
## The absolute percentage difference

# Module-level values; abspe() does not read them because its own parameters
# of the same names shadow them.
x = 1
y = 1


def abspe(x=1, y=1):
    """Print and return the absolute relative difference of x from y.

    Parameters:
        x: the value being compared (default 1).
        y: the reference value used as the denominator (default 1).

    Returns:
        abs((x - y) / y). This is a fraction, not multiplied by 100. The
        value is also printed, so calls that only rely on the printed output
        keep working.

    Raises:
        ZeroDivisionError: if y is 0.
    """
    abpe = abs((x - y) / y)
    print(abpe)
    # Hand the result back so callers can use it, not just read it on screen.
    return abpe


abspe(x=5, y=9)
abspe(10, 100)
266+
### Sum-of-squares function

def sumsquares(*inputnums):
    """Print and return the sum of the squares of all positional arguments.

    Called with no arguments, the result is 0.
    """
    total = 0
    for value in inputnums:
        total = total + value ** 2
    # NOTE(review): the source lost its indentation; the print is assumed to
    # run once, after the loop has finished - confirm.
    print(total)
    return total


sumsquares(1, 1, 1, 1, 1)
sumsquares(1, 2, 5, 8, -1)
279+
### Function for summary
import pandas as pd

column_names = ["Name", "Mean", "Median", "Variance", "S.D", "p5",
                "p10", "p20", "p25", "p30", "p50", "p75", "p80", "p90", "p95", "p97", "p99"]
# Module-level result table; allsummary() writes one row per summarised column.
summary_df = pd.DataFrame(columns=column_names)


def allsummary(df):
    """Fill the module-level summary_df with statistics for every column of df.

    For each column of df, in order, one row (labelled 1, 2, 3, ...) is
    written to summary_df holding the column name, mean, median, variance,
    standard deviation and the percentiles named in column_names. The
    percentiles are computed on the non-null values only. The filled table is
    printed once all columns are done.

    Rows are written in place, so calling the function again overwrites rows
    1..k and leaves any higher-numbered rows from an earlier call untouched.

    Parameters:
        df: DataFrame whose columns support mean/median/var/std/quantile,
            i.e. numeric columns.

    Returns:
        None. The result is available in summary_df.
    """
    # Column label in summary_df -> quantile level it stores.
    percentile_levels = (
        ("p5", 0.05), ("p10", 0.1), ("p20", 0.2), ("p25", 0.25),
        ("p30", 0.3), ("p50", 0.5), ("p75", 0.75), ("p80", 0.8),
        ("p90", 0.9), ("p95", 0.95), ("p97", 0.97), ("p99", 0.99),
    )
    for i, f in enumerate(df.columns.values, start=1):
        column = df[f]
        # "p5" used to be the 0.1 quantile of the boolean notnull mask; every
        # percentile is now taken from the same non-null values.
        non_null = column.dropna()
        row = {
            "Name": f,
            "Mean": column.mean(),
            "Median": column.median(),
            "Variance": column.var(),
            "S.D": column.std(),
        }
        for label, level in percentile_levels:
            row[label] = non_null.quantile(level)
        # DataFrame.set_value no longer exists in current pandas; .loc both
        # creates the row on first use and sets the cell.
        for label, value in row.items():
            summary_df.loc[i, label] = value
    print(summary_df)
308+
# Load the "Give me some Credit" training data and summarise all its columns.
credit_risk = pd.read_csv("E:\\ Larning\\ hadoop\\ Data Science\\ 001_Python\\ Class Files Python\\ Class Files Python\\ 1.Python Programming\\ 3.Basic Statistics and Reporting in Python\\ datasets\\ Give me some Credit\\ cs-training.csv", encoding="ISO-8859-1")

allsummary(credit_risk)
312+
### How dropna(axis=...) works when called on a DataFrame
###   axis=0 drops the rows that contain NA values
###   axis=1 drops the columns that contain NA values

import numpy as np

# 5 rows x 3 columns of random numbers
df = pd.DataFrame(np.random.randn(5, 3), columns=['one', 'two', 'three'])
# Reindexing to labels 0..7 adds rows 5, 6 and 7, filled with NaN.
df1 = df.reindex([0, 1, 2, 3, 4, 5, 6, 7])
# A constant column, so it has no missing values in any row.
df1["colfour"] = 4

print(df1)

# axis=0: the rows where "one" is NaN are removed
df1[["one", "colfour"]]
df1[["one", "colfour"]].dropna(axis=0)

# axis=1: the column "one" (which contains NaN) is removed, "colfour" stays
df1[["one", "colfour"]]
df1[["one", "colfour"]].dropna(axis=1)
0 commit comments