-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathBasic statistics.py
329 lines (239 loc) · 11.5 KB
/
Basic statistics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
#Basic Statistics, Graphs and Reports
#Taking a random sample
import pandas as pd
#List every name (functions, classes, submodules) exposed by the pandas module.
#NOTE: bare expressions such as dir(pd) or .shape only display a value in an
#interactive session; run as a script they are evaluated and discarded.
dir(pd)
####################Sampling in Python#############################
#Taking a random sample
import pandas as pd
#Load the Online Retail sales data; the file is decoded as ISO-8859-1
Online_Retail=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Online Retail Sales Data\\Online Retail.csv", encoding = "ISO-8859-1")
Online_Retail.shape
#Draw 1000 rows at random; no seed is given, so each run returns a different sample
sample_data=Online_Retail.sample(n=1000)
sample_data.shape
print(sample_data.head())
#Regenerating same sample again: a fixed random_state makes the draw reproducible
sample_data1=Online_Retail.sample(n=1000 , random_state=12 )
sample_data1.shape
print(sample_data1.head())
#####################LAB: Sampling in python#############################
#Import "Census Income Data/Income_data.csv"
Income=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Census Income Data\\Income_data.csv")
#Inspect the size, the first rows and the last 3 rows of the data
Income.shape
Income.head()
Income.tail(3)
#Sample size 5000 (unseeded, so the rows differ from run to run)
Sample_income=Income.sample(n=5000)
Sample_income.shape
#####################Descriptive statistics#####################
#Import "Census Income Data/Income_data.csv"
#(re-reads the same file that the sampling lab loaded into Income)
Income=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Census Income Data\\Income_data.csv")
#Column names as an array
Income.columns.values
#Mean and Median on python
#Mean of the "capital-gain" column
gain_mean=Income["capital-gain"].mean()
gain_mean
#Median of the "capital-gain" column
gain_median=Income["capital-gain"].median()
gain_median
#####################LAB: Mean and Median on python#####################
#NOTE(review): the dataset directory is spelled "Online_Retail_Sales_Data" here,
#while the sampling section reads from "Online Retail Sales Data" - confirm which
#directory actually exists, otherwise one of the two reads will fail.
Online_Retail=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Online_Retail_Sales_Data\\Online Retail.csv", encoding = "ISO-8859-1")
Online_Retail.shape
Online_Retail.columns.values
#Mean and median of 'UnitPrice' in Online Retail data
up_mean=Online_Retail['UnitPrice'].mean()
up_mean
up_median=Online_Retail['UnitPrice'].median()
up_median
#Mean and median of 'Quantity' in Online Retail data
Quantity_mean=Online_Retail['Quantity'].mean()
Quantity_mean
Quantity_median=Online_Retail['Quantity'].median()
Quantity_median
#####################Dispersion Measures#####################
#####################Variance and Standard deviation#####################
#Split the census rows by native country.
#The literal ' United-States' starts with a space - presumably the CSV values
#carry that leading space; verify against the data if the filter returns no rows.
usa_income=Income[Income["native-country"]==' United-States']
usa_income.shape
other_income=Income[Income["native-country"]!=' United-States']
other_income.shape
#Var and SD of "education-num" for USA
var_usa=usa_income["education-num"].var()
var_usa
std_usa=usa_income["education-num"].std()
std_usa
#Var and SD of "education-num" for all other countries
var_other=other_income["education-num"].var()
var_other
std_other=other_income["education-num"].std()
std_other
#####################LAB: Variance and Standard deviation#####################
#Variance and SD of UnitPrice (pandas defaults to the sample statistic, ddof=1)
var_UnitPrice=Online_Retail['UnitPrice'].var()
var_UnitPrice
std_UnitPrice=Online_Retail['UnitPrice'].std()
std_UnitPrice
#Variance and SD of Quantity.
#Stored under their own names so the UnitPrice results above are not overwritten.
var_Quantity=Online_Retail['Quantity'].var()
var_Quantity
std_Quantity=Online_Retail['Quantity'].std()
std_Quantity
######################Percentiles & Quartiles #####################
#describe() gives count, mean, std, min, quartiles and max in one call
Income["capital-gain"].describe()
#Finding the percentile & quantile by using .quantile()
#Deciles (0%, 10%, ..., 100%) of capital-gain and capital-loss
Income['capital-gain'].quantile([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
Income['capital-loss'].quantile([0, 0.1, 0.2, 0.3,0.4,0.5,0.6,0.7,0.8,0.9,1])
#Deciles plus the 95th and 98th percentiles to look closer at the upper tail
Income['hours-per-week'].quantile([0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.95,0.98,1])
######################LAB: Percentiles & quartiles in python######################
#Load the bank tele-marketing data; the file is decoded as ISO-8859-1
bank=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Bank Tele Marketing\\bank_market.csv",encoding = "ISO-8859-1")
bank.shape
#Get the summary of the balance variable
#we can find the summary of the balance variable by using .describe()
summary_bala=bank["balance"].describe()
summary_bala
#Get relevant percentiles (deciles) and see their distribution.
bank['balance'].quantile([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
#Get the summary of the age variable
summary_age=bank['age'].describe()
summary_age
#Get relevant percentiles (deciles) and see their distribution
bank['age'].quantile([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
######################LAB: Box plots and outlier detection######################
#Do you suspect any outliers in balance?
#(re-reads the same bank file that the percentiles lab loaded)
bank=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Bank Tele Marketing\\bank_market.csv",encoding = "ISO-8859-1")
bank.shape
import matplotlib.pyplot as plt
#Draw a basic box plot with matplotlib.pyplot (plt.boxplot())
plt.boxplot(bank.balance)
#Get relevant percentiles (deciles plus the 95th) and see their distribution
bank['balance'].quantile([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,0.95, 1])
#Do you suspect any outliers in balance?
#Outliers are present in the balance variable
#Do you suspect any outliers in age?
#Detect the outliers in the age variable with plt.boxplot()
plt.boxplot(bank.age)
#No outliers are present
#Get relevant percentiles (deciles plus the 95th) and see their distribution
bank['age'].quantile([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95,1])
#Do you suspect any outliers in age?
#Outliers are not present in the age variable
######################Creating Graphs ################################
##Scatter Plot:
#Load the cars data; the file is decoded as ISO-8859-1
cars=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Cars Data\\Cars.csv",encoding = "ISO-8859-1")
cars.shape
cars.columns.values
#Summaries of the two variables that are plotted against each other below
cars['Horsepower'].describe()
cars['MPG_City'].describe()
import matplotlib.pyplot as plt
#Horsepower on the x-axis, city MPG on the y-axis
plt.scatter(cars.Horsepower,cars.MPG_City)
######################LAB:Creating Graphs ################################
import matplotlib.pyplot as plt
#Sports data
sports_data=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Sporting_goods_sales\\Sporting_goods_sales.csv")
sports_data.head(10)
#Draw a scatter plot between Average_Income and Sales. Is there any relation between the two variables?
plt.scatter(sports_data.Average_Income,sports_data.Sales)
import numpy as np
#2x2 correlation matrix; the off-diagonal entry is the correlation between the two columns
np.corrcoef(sports_data.Average_Income,sports_data.Sales)
#Draw a scatter plot between Under35_Population_pect and Sales. Is there any relation between the two variables?
plt.scatter(sports_data.Under35_Population_pect,sports_data.Sales,color="red")
np.corrcoef(sports_data.Under35_Population_pect,sports_data.Sales)
######################Bar Chart######################
#Bar charts are used to summarize categorical variables
import pandas as pd
cars=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Cars Data\\Cars.csv",encoding = "ISO-8859-1")
cars.shape
cars.columns.values
#Frequency table: index = distinct Cylinders values, values = number of rows for each
freq=cars.Cylinders.value_counts()
freq.values
freq.index
import matplotlib.pyplot as plt
#One bar per distinct Cylinders value, bar height = its count
plt.bar(freq.index,freq.values)
######################LAB: Bar Chart######################
#Frequency table of Avg_family_size (rebinds freq, replacing the Cylinders counts)
freq=sports_data.Avg_family_size.value_counts()
freq.values
freq.index
import matplotlib.pyplot as plt
#The same bar chart drawn three times with progressively more options:
#default call, explicit centre alignment, then tick labels taken from the category values
plt.bar(freq.index,freq.values)
plt.bar(freq.index,freq.values, align="center")
plt.bar(freq.index,freq.values, align="center",tick_label=freq.index)
######################Trend Chart######################
#Load the air travel data; the file is decoded as ISO-8859-1
AirPassengers=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Air Travel Data\\Air_travel.csv", encoding = "ISO-8859-1")
AirPassengers.head()
AirPassengers.dtypes
AirPassengers.columns.values
import matplotlib.pyplot as plt
#With a single argument the values are plotted against the row index
plt.plot(AirPassengers.AIR)
#X-axis label
#The DATE column must be in DD-MM-YYYY form in the CSV; it is parsed with that format here
AirPassengers['new_time']=pd.to_datetime(AirPassengers['DATE'],format='%d-%m-%Y')
#Parsed dates on the x-axis, AIR on the y-axis
plt.plot(AirPassengers.new_time,AirPassengers.AIR)
# Any single array will give a time-series style plot (values against their index)
plt.plot(sports_data.Avg_family_size)
#Formatted col
################################
## User-defined functions
def mydistance(x1=1, y1=1, x2=1, y2=1):
    """Print and return the Euclidean distance between (x1, y1) and (x2, y2).

    Parameters
    ----------
    x1, y1 : coordinates of the first point (each defaults to 1).
    x2, y2 : coordinates of the second point (each defaults to 1).

    Returns
    -------
    The distance as a float. It is also printed, so existing calls that only
    relied on the printed output behave as before.
    """
    # Kept function-local so the definition is usable on its own.
    import math

    dist = math.sqrt(pow((x1 - x2), 2) + pow((y1 - y2), 2))
    print(dist)
    # Hand the value back as well; previously it was only printed and then lost.
    return dist


mydistance(x1=0, y1=0, x2=2, y2=2)
mydistance(x1=1, y1=0, x2=0, y2=1)
mydistance(x1=4, y1=6, x2=1, y2=2)
mydistance(4, 6, 1, 2)
##The Absolute percentage difference
x=1
y=1


def abspe(x=1, y=1):
    """Print and return the absolute relative difference |(x - y) / y|.

    Parameters
    ----------
    x : the value being compared (defaults to 1).
    y : the reference value used as the denominator (defaults to 1).

    Returns
    -------
    abs((x - y) / y). It is also printed, so existing calls that only relied
    on the printed output behave as before.

    Raises
    ------
    ZeroDivisionError
        If y is 0 (for plain Python numbers), because y is the denominator.
    """
    abpe = abs((x - y) / y)
    print(abpe)
    # Hand the value back as well; previously it was only printed and then lost.
    return abpe


abspe(x=5, y=9)
abspe(10, 100)
###Sum of squares functions
def sumsquares(*inputnums):
    """Print and return the sum of the squares of all positional arguments.

    Parameters
    ----------
    *inputnums : any number of numeric values.

    Returns
    -------
    The sum of n**2 over every argument; 0 when called with no arguments.
    """
    total = sum(pow(n, 2) for n in inputnums)
    # The total is printed once, after all inputs have been accumulated.
    print(total)
    return total


sumsquares(1, 1, 1, 1, 1)
sumsquares(1, 2, 5, 8, -1)
###Function for summary
import pandas as pd
column_names = ["Name", "Mean", "Median", "Variance", "S.D", "p5",
                "p10", "p20", "p25", "p30", "p50", "p75", "p80", "p90", "p95", "p97", "p99"]
# Module-level table that allsummary() fills in: one row per summarised column.
summary_df = pd.DataFrame(columns=column_names)

# Quantile fraction for each percentile column, in the same order as the
# "p5" ... "p99" entries of column_names.
_PERCENTILE_FRACTIONS = [0.05, 0.10, 0.20, 0.25, 0.30, 0.50,
                         0.75, 0.80, 0.90, 0.95, 0.97, 0.99]


def allsummary(df):
    """Fill summary_df with descriptive statistics for every column of df.

    For each column, one row is written to the module-level summary_df holding
    the column name, mean, median, variance, standard deviation and the
    percentiles listed in column_names. Rows are labelled 1, 2, 3, ... in
    column order, so calling the function again overwrites rows with the same
    label and leaves any higher-numbered rows from an earlier call in place.

    Parameters
    ----------
    df : pandas DataFrame whose columns support mean/median/var/std/quantile
        (i.e. numeric columns).

    Returns
    -------
    The module-level summary_df, which is also printed.
    """
    for row_label, col_name in enumerate(df.columns.values, start=1):
        column = df[col_name]
        # All percentiles are taken from the non-missing values in one call.
        # "p5" now uses the 0.05 quantile of the data itself; it previously
        # took the 0.1 quantile of the not-null boolean mask.
        percentiles = column.dropna().quantile(_PERCENTILE_FRACTIONS).tolist()
        # Row assignment through .loc replaces DataFrame.set_value, which no
        # longer exists in current pandas; it adds the row if it is missing.
        summary_df.loc[row_label] = [
            col_name,
            column.mean(),
            column.median(),
            column.var(),
            column.std(),
        ] + percentiles
    print(summary_df)
    return summary_df
#Load the credit-risk training data (decoded as ISO-8859-1) and summarise every column
credit_risk=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Give me some Credit\\cs-training.csv", encoding = "ISO-8859-1")
allsummary(credit_risk)
###How dropna(axis=0) works
###The demo below calls dropna on a DataFrame.
### axis=1 drops columns with NA values
### axis=0 drops rows with NA values
import numpy as np
#5x3 frame of random normal values, with no missing data yet
df = pd.DataFrame(np.random.randn(5, 3), columns=['one', 'two', 'three'])
#Reindexing to labels 0-7 adds rows 5, 6 and 7, which pandas fills with NaN
df1=df.reindex([0,1,2,3,4,5,6,7])
#New constant column: populated on every row, including the NaN-filled ones
df1["colfour"]=4
print(df1)
df1[["one","colfour"]]
#Rows 5-7 are dropped because "one" is NaN there
df1[["one","colfour"]].dropna(axis=0)
#dropna returned a new frame; the selection itself is unchanged
df1[["one","colfour"]]
#Column "one" is dropped because it contains NaN; "colfour" is kept
df1[["one","colfour"]].dropna(axis=1)