Skip to content

Commit a18cee1

Browse files
authored
Add files via upload
1 parent e3eb2d2 commit a18cee1

File tree

1 file changed

+329
-0
lines changed

1 file changed

+329
-0
lines changed

Basic statistics.py

Lines changed: 329 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,329 @@
1+
#Basic Statistics, Graphs and Reports
2+
#Taking a random sample
3+
import pandas as pd
4+
#View all the names (functions, classes, ...) available in the pandas module
5+
dir(pd)
6+
7+
####################Sampling in Python#############################
8+
#Taking a random sample
9+
import pandas as pd
10+
11+
Online_Retail=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Online Retail Sales Data\\Online Retail.csv", encoding = "ISO-8859-1")
12+
Online_Retail.shape
13+
14+
sample_data=Online_Retail.sample(n=1000)
15+
sample_data.shape
16+
print(sample_data.head())
17+
18+
#Regenerating same sample again
19+
20+
sample_data1=Online_Retail.sample(n=1000 , random_state=12 )
21+
sample_data1.shape
22+
print(sample_data1.head())
23+
24+
#####################LAB: Sampling in python#############################
25+
26+
#Import “Census Income Data/Income_data.csv”
27+
Income=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Census Income Data\\Income_data.csv")
28+
Income.shape
29+
Income.head()
30+
Income.tail(3)
31+
#Sample size 5000
32+
Sample_income=Income.sample(n=5000)
33+
Sample_income.shape
34+
35+
#####################Descriptive statistics#####################
36+
#Import “Census Income Data/Income_data.csv”
37+
Income=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Census Income Data\\Income_data.csv")
38+
39+
Income.columns.values
40+
41+
#Mean and Median on python
42+
gain_mean=Income["capital-gain"].mean()
43+
gain_mean
44+
45+
gain_median=Income["capital-gain"].median()
46+
gain_median
47+
48+
#####################LAB: Mean and Median on python#####################
49+
50+
Online_Retail=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Online_Retail_Sales_Data\\Online Retail.csv", encoding = "ISO-8859-1")
51+
Online_Retail.shape
52+
Online_Retail.columns.values
53+
54+
#Mean and median of 'UnitPrice' in Online Retail data
55+
up_mean=Online_Retail['UnitPrice'].mean()
56+
up_mean
57+
58+
up_median=Online_Retail['UnitPrice'].median()
59+
up_median
60+
61+
#Mean and median of "Quantity" in Online Retail data
62+
Quantity_mean=Online_Retail['Quantity'].mean()
63+
Quantity_mean
64+
65+
Quantity_median=Online_Retail['Quantity'].median()
66+
Quantity_median
67+
68+
#####################Dispersion Measures#####################
69+
70+
#####################Variance and Standard deviation#####################
71+
usa_income=Income[Income["native-country"]==' United-States']
72+
usa_income.shape
73+
74+
other_income=Income[Income["native-country"]!=' United-States']
75+
other_income.shape
76+
77+
#Var and SD for USA
78+
var_usa=usa_income["education-num"].var()
79+
var_usa
80+
81+
std_usa=usa_income["education-num"].std()
82+
std_usa
83+
84+
var_other=other_income["education-num"].var()
85+
var_other
86+
87+
std_other=other_income["education-num"].std()
88+
std_other
89+
90+
#####################LAB: Variance and Standard deviation#####################
91+
##var and sd UnitPrice
92+
var_UnitPrice=Online_Retail['UnitPrice'].var()
93+
var_UnitPrice
94+
95+
std_UnitPrice=Online_Retail['UnitPrice'].std()
96+
std_UnitPrice
97+
98+
#variance and sd of Quantity
99+
# Keep the Quantity statistics under their own names so the UnitPrice
# variance and standard deviation computed just above are not overwritten.
var_Quantity=Online_Retail['Quantity'].var()
var_Quantity

std_Quantity=Online_Retail['Quantity'].std()
std_Quantity
104+
105+
######################Percentiles & Quartiles #####################
106+
107+
Income["capital-gain"].describe()
108+
109+
#Finding the percentile & quantile by using .quantile()
110+
Income['capital-gain'].quantile([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
111+
Income['capital-loss'].quantile([0, 0.1, 0.2, 0.3,0.4,0.5,0.6,0.7,0.8,0.9,1])
112+
Income['hours-per-week'].quantile([0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.95,0.98,1])
113+
114+
######################LAB: Percentiles & quartiles in python######################
115+
bank=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Bank Tele Marketing\\bank_market.csv",encoding = "ISO-8859-1")
116+
bank.shape
117+
118+
#Get the summary of the balance variable
119+
#we can find the summary of the balance variable by using .describe()
120+
summary_bala=bank["balance"].describe()
121+
summary_bala
122+
123+
#Get relevant percentiles and see their distribution.
124+
bank['balance'].quantile([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
125+
126+
#Get the summary of the age variable
127+
summary_age=bank['age'].describe()
128+
summary_age
129+
130+
#Get relevant percentiles and see their distribution
131+
bank['age'].quantile([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
132+
133+
######################LAB: Box plots and outlier detection######################
134+
#Do you suspect any outliers in balance
135+
bank=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Bank Tele Marketing\\bank_market.csv",encoding = "ISO-8859-1")
136+
bank.shape
137+
138+
import matplotlib.pyplot as plt
139+
140+
#Basic boxplot using matplotlib.pyplot (imported as plt): plt.boxplot()
141+
plt.boxplot(bank.balance)
142+
143+
#Get relevant percentiles and see their distribution
144+
bank['balance'].quantile([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,0.95, 1])
145+
#Do you suspect any outliers in balance
146+
# outlier are present in balance variable
147+
148+
#Do you suspect any outliers in age
149+
#Detect the outliers in the age variable with plt.boxplot()
150+
plt.boxplot(bank.age)
151+
#No outliers are present
152+
#Get relevant percentiles and see their distribution
153+
bank['age'].quantile([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95,1])
154+
#Do you suspect any outliers in age
155+
#outliers are not present in age variable
156+
157+
158+
######################Creating Graphs ################################
159+
160+
##Scatter Plot:
161+
162+
cars=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Cars Data\\Cars.csv",encoding = "ISO-8859-1")
163+
cars.shape
164+
cars.columns.values
165+
166+
cars['Horsepower'].describe()
167+
cars['MPG_City'].describe()
168+
169+
import matplotlib.pyplot as plt
170+
plt.scatter(cars.Horsepower,cars.MPG_City)
171+
172+
173+
######################LAB:Creating Graphs ################################
174+
175+
import matplotlib.pyplot as plt
176+
177+
178+
#Sports data
179+
sports_data=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Sporting_goods_sales\\Sporting_goods_sales.csv")
180+
sports_data.head(10)
181+
182+
#Draw a scatter plot between Average_Income and Sales. Is there any relation between two variables
183+
plt.scatter(sports_data.Average_Income,sports_data.Sales)
184+
185+
import numpy as np
186+
np.corrcoef(sports_data.Average_Income,sports_data.Sales)
187+
188+
#Draw a scatter plot between Under35_Population_pect and Sales. Is there any relation between the two variables?
189+
plt.scatter(sports_data.Under35_Population_pect,sports_data.Sales,color="red")
190+
np.corrcoef(sports_data.Under35_Population_pect,sports_data.Sales)
191+
192+
######################Bar Chart######################
193+
#Bar charts used to summarize the categorical variables
194+
195+
import pandas as pd
196+
197+
cars=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Cars Data\\Cars.csv",encoding = "ISO-8859-1")
198+
cars.shape
199+
cars.columns.values
200+
201+
freq=cars.Cylinders.value_counts()
202+
freq.values
203+
freq.index
204+
205+
import matplotlib.pyplot as plt
206+
plt.bar(freq.index,freq.values)
207+
######################LAB: Bar Chart######################
208+
209+
freq=sports_data.Avg_family_size.value_counts()
210+
freq.values
211+
freq.index
212+
213+
import matplotlib.pyplot as plt
214+
plt.bar(freq.index,freq.values)
215+
plt.bar(freq.index,freq.values, align="center")
216+
plt.bar(freq.index,freq.values, align="center",tick_label=freq.index)
217+
218+
219+
######################Trend Chart######################
220+
221+
AirPassengers=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Air Travel Data\\Air_travel.csv", encoding = "ISO-8859-1")
222+
AirPassengers.head()
223+
AirPassengers.dtypes
224+
AirPassengers.columns.values
225+
226+
import matplotlib.pyplot as plt
227+
plt.plot(AirPassengers.AIR)
228+
229+
230+
#X axis label
231+
#Format the date to DD-MM-YYYY before importing
232+
AirPassengers['new_time']=pd.to_datetime(AirPassengers['DATE'],format='%d-%m-%Y')
233+
plt.plot(AirPassengers.new_time,AirPassengers.AIR)
234+
235+
# Any single array will give time series plot
236+
plt.plot(sports_data.Avg_family_size)
237+
#Formatted col
238+
239+
240+
################################
241+
## User defined Functions
242+
243+
def mydistance(x1=1,y1=1,x2=1,y2=1):
    """Print and return the Euclidean distance between two 2-D points.

    Args:
        x1, y1: coordinates of the first point (both default to 1).
        x2, y2: coordinates of the second point (both default to 1).

    Returns:
        The straight-line distance between (x1, y1) and (x2, y2) as a float.
        The value is also printed, as the function has always done.
    """
    # Imported locally because the visible script has no module-level
    # ``import math``.
    import math

    # hypot(dx, dy) is sqrt(dx**2 + dy**2) without squaring the inputs first.
    dist = math.hypot(x1 - x2, y1 - y2)
    print(dist)
    return dist
248+
249+
mydistance(x1=0,y1=0,x2=2,y2=2)
250+
mydistance(x1=1,y1=0,x2=0,y2=1)
251+
mydistance(x1=4,y1=6,x2=1,y2=2)
252+
mydistance(4,6,1,2)
253+
254+
##The Absolute percentage difference
255+
256+
x=1
257+
y=1
258+
259+
def abspe(x=1,y=1):
    """Print and return the absolute relative difference of x from y.

    The result is ``abs((x - y) / y)``. It is a fraction, not scaled by 100,
    so 0.9 corresponds to a 90% difference.

    Args:
        x: the value being compared (default 1).
        y: the reference value used as the denominator (default 1).

    Returns:
        The absolute relative difference. The value is also printed.

    Raises:
        ZeroDivisionError: if ``y`` is 0 (for plain Python numbers).
    """
    abpe = abs((x - y) / y)
    print(abpe)
    return abpe
263+
264+
abspe(x=5,y=9)
265+
abspe(10,100)
266+
267+
###Sum of squares functions
268+
269+
def sumsquares(*inputnums):
    """Print and return the sum of the squares of all positional arguments.

    With no arguments the result is 0.
    """
    total = sum(value ** 2 for value in inputnums)
    print(total)
    return total
275+
276+
277+
sumsquares (1,1,1,1,1)
278+
sumsquares (1,2,5,8,-1)
279+
280+
###Function for summary
281+
import pandas as pd
282+
column_names = ["Name","Mean", "Median", "Variance","S.D", "p5",
                "p10", "p20", "p25", "p30", "p50", "p75", "p80", "p90", "p95", "p97", "p99"]
summary_df=pd.DataFrame(columns=column_names)

# Quantile level behind each percentile column of summary_df ("p5" ... "p99"),
# listed in the same order as those columns appear in column_names.
_PERCENTILE_LEVELS = [0.05, 0.10, 0.20, 0.25, 0.30, 0.50,
                      0.75, 0.80, 0.90, 0.95, 0.97, 0.99]


def allsummary(df):
    """Fill the module-level ``summary_df`` with statistics for each column of df.

    One row is written per numeric column of ``df``, indexed from 1. Each row
    holds the column name, mean, median, variance, standard deviation and the
    percentiles named in ``column_names``. Missing values are dropped before
    any statistic is computed. Non-numeric columns are skipped.

    Args:
        df: the pandas DataFrame to summarise.

    Returns:
        The shared ``summary_df`` (the same object that is updated in place).
        The table is also printed.
    """
    # Empty the shared table in place so rows left by an earlier call on a
    # wider frame do not linger below the rows written now.
    summary_df.drop(summary_df.index, inplace=True)

    numeric = df.select_dtypes(include="number")
    for i, f in enumerate(numeric.columns.values, start=1):
        values = numeric[f].dropna()
        # One quantile call per column; the result follows _PERCENTILE_LEVELS
        # order, which matches the percentile columns of summary_df.
        percentiles = values.quantile(_PERCENTILE_LEVELS).tolist()
        # .loc with a new label appends the row (DataFrame.set_value no
        # longer exists in current pandas).
        summary_df.loc[i] = [f, values.mean(), values.median(),
                             values.var(), values.std()] + percentiles
    print(summary_df)
    return summary_df
308+
309+
credit_risk=pd.read_csv("E:\\Larning\\hadoop\\Data Science\\001_Python\\Class Files Python\\Class Files Python\\1.Python Programming\\3.Basic Statistics and Reporting in Python\\datasets\\Give me some Credit\\cs-training.csv", encoding = "ISO-8859-1")
310+
311+
allsummary(credit_risk)
312+
313+
###How dropna(axis=0) works
314+
###dropna is applied to a DataFrame in this demo (a Series has a dropna method as well).
315+
### Axis=1 drops columns with NA values
316+
### Axis=0 drops rows with NA values
317+
318+
import numpy as np
319+
df = pd.DataFrame(np.random.randn(5, 3), columns=['one', 'two', 'three'])
320+
df1=df.reindex([0,1,2,3,4,5,6,7])
321+
df1["colfour"]=4
322+
323+
print(df1)
324+
325+
df1[["one","colfour"]]
326+
df1[["one","colfour"]].dropna(axis=0)
327+
328+
df1[["one","colfour"]]
329+
df1[["one","colfour"]].dropna(axis=1)

0 commit comments

Comments
 (0)