In [None]:
## Create a Dataset

import numpy as np
import matplotlib.pyplot as plt


## Generate a synthetic dataset of student scores out of 100.

## np.random.default_rng() only *returns* a Generator; it does not seed the
## legacy global np.random.* functions. Keep the returned object and draw
## from it, otherwise the scores are different on every run.
rng = np.random.default_rng(42)

## mean=70, std=10, 25 students
scores = rng.normal(loc=70, scale=10, size=25)

## Keep every score inside the valid 0 - 100 range.
scores = np.clip(scores, 0, 100)

## Round to the nearest whole mark and store as integers.
scores = np.rint(scores).astype(int)

print(scores)


In [None]:
# Descriptive statistics for the `scores` array.

mean_score = np.mean(scores)
median_score = np.median(scores)

# Mode: np.mod is the element-wise remainder, not the statistical mode, so
# count how often each distinct score occurs instead. np.unique returns the
# values in ascending order and argmax picks the first maximum, so ties
# resolve to the smallest of the most frequent scores.
unique_scores, score_counts = np.unique(scores, return_counts=True)
mode_score = unique_scores[np.argmax(score_counts)]

# NumPy's defaults (ddof=0) give the population variance / standard deviation.
variance_score = np.var(scores)
std_dev_score = np.std(scores)

min_score = np.min(scores)
max_score = np.max(scores)

# Quartiles; the 50th percentile is the median again.
percentiles = np.percentile(scores, [25, 50, 75])

print("Mean:", mean_score)
print("Median:", median_score)
print("Mode:", mode_score)
print("Variance:", variance_score)
print("Standard Deviation:", std_dev_score)
print("Min:", min_score)
print("Max:", max_score)
print("25th, 50th, 75th Percentiles:", percentiles)


In [None]:
print("Scores:", scores)
print("Sorted_scores:", np.sort(scores))

# Absolute-frequency histogram; bins='auto' lets NumPy choose the bin edges.
plt.hist(scores, bins='auto', color='skyblue', edgecolor='black')
plt.title("Distribution of students score [0 - 100]")
plt.xlabel("scores")
plt.ylabel("Frequency")
plt.show()

# Relative histogram: density=True rescales the bars so their total area is 1.
plt.hist(scores, bins='auto', density=True)
plt.title("Relative histogram of students score")
plt.xlabel("scores")
plt.ylabel("Density")
plt.show()

# Cumulative relative histogram: each bar adds the bars to its left, so the
# last bar reaches 1.
plt.hist(scores, bins='auto', cumulative=True, density=True)
plt.title("Cumulative distribution of students score")
plt.xlabel("scores")
plt.ylabel("Cumulative proportion")
plt.show()

In [None]:
# Horizontal box plot of the scores, titled and labelled through the current axes.
plt.boxplot(scores, vert=False)
plt.gca().set(title="Box plots for students score", xlabel="Scores")
plt.show()

# Default (vertical) box plot; showmeans=True additionally marks the mean.
plt.boxplot(scores, showmeans=True)
plt.show()



In [None]:
# Probability and random sampling:

# Dedicated seeded generator so the same five scores are drawn on every run
# (the legacy np.random.choice would use the unseeded global state).
sample_rng = np.random.default_rng(42)

# Draw 5 distinct entries of `scores` (replace=False: no entry is picked twice).
sample = sample_rng.choice(scores, size=5, replace=False)

print("Sample mean: ", np.mean(sample))
# ddof=1 applies Bessel's correction (divide by n - 1), which is what
# "sample standard deviation" means; NumPy's default ddof=0 is the population
# formula and underestimates the spread for a 5-element sample.
print("Sample standard deviation:", np.std(sample, ddof=1))



In [None]:
# Correlation Example

# Generate a dataset (study hours vs. exam scores).

# Dedicated seeded generator so this cell gives the same data on every run.
# The seed differs from the 42 used for the scores dataset so the two
# synthetic datasets are not built from the same underlying draws.
study_rng = np.random.default_rng(43)

study_hours = study_rng.normal(loc=5, scale=2, size=25)
study_hours = np.clip(study_hours, 0, None)  # hours cannot be negative

# Assume scores rise linearly with study hours (base 50, 8 points per hour)
# plus Gaussian noise with std 5. The noise takes its shape from study_hours
# so the sample count is written in one place only.
exam_scores = 50 + (study_hours * 8) + study_rng.normal(0, 5, study_hours.shape)
exam_scores = np.clip(exam_scores, 0, 100)

# Pearson correlation: off-diagonal entry of the 2x2 correlation matrix.
corr = np.corrcoef(study_hours, exam_scores)[0, 1]

print("Correlation between study hours and exam scores", corr)

# Scatter plot of exam scores against study hours.
plt.scatter(study_hours, exam_scores, alpha=0.6, color='green')
plt.title("Exam scores vs. study hours")
plt.xlabel("Study Hours")
plt.ylabel("Exam scores")
plt.show()

# Box plots of both variables on one axis. Boxes are drawn at positions 1 and 2
# in the order the datasets are passed, so name the ticks explicitly; without
# this the reader only sees "1" and "2".
plt.boxplot((exam_scores, study_hours), showmeans=True, patch_artist=True)
plt.xticks([1, 2], ["Exam scores", "Study hours"])
plt.title("Exam scores and study hours")
plt.show()


In [None]:
# Violin plot of the score distribution, drawn on the current axes.

violin_axes = plt.gca()
violin_axes.violinplot(scores)

In [None]:
# Bar chart and pie chart

# Parallel lists: a cancer type and the number associated with it.
cancer = ['Lung', 'Breast', 'Colon', 'Prostate', 'Melanoma', 'Bladder']
numbers = [42, 50, 32, 55, 9, 12]

# Pair the two lists into a mapping (insertion order is preserved).
cancer_dict = {name: count for name, count in zip(cancer, numbers)}
print(cancer_dict)

# Split the mapping back into label / value lists for plotting.
cancer_type = [*cancer_dict]
cancer_type_numbers = [*cancer_dict.values()]

print(cancer_type)
print(cancer_type_numbers)


# Pie chart: one wedge per cancer type, sized by its number.
plt.pie(x=cancer_type_numbers, labels=cancer_type, shadow=True)
plt.show()

# Bar chart: bars sit at integer positions 0..n-1 and are labelled by type.
x = np.arange(len(numbers))
bar_axes = plt.gca()
bar_axes.bar(x, numbers, tick_label=cancer)
bar_axes.grid()
plt.show()


In [None]:
# Load the two whitespace-delimited data files (paths are relative to the
# working directory the notebook runs in).
sachins_data = np.loadtxt(fname="data/sachin.txt")
kohalis_data = np.loadtxt(fname="data/kohli.txt")

# Preview the first ten values of each array, one array per line group.
print(sachins_data[:10], kohalis_data[:10], sep="\n")

# NOTE(review): the unit of the loaded values is not visible in this notebook
# (presumably runs per innings) - replace the generic "Value" labels below once
# that is confirmed.

# One histogram per player, each in its own figure.
for player_name, player_data in (("Sachin", sachins_data), ("Kohli", kohalis_data)):
    plt.hist(player_data)
    plt.title(f"{player_name}: distribution of values")
    plt.xlabel("Value")
    plt.ylabel("Frequency")
    plt.show()

# Grouped histogram: passing both datasets together draws their bars side by
# side in shared bins. The x axis is the recorded value and the y axis is the
# count per bin; the legend (not the axis labels) tells the players apart.
plt.hist((sachins_data, kohalis_data), label=('Sachin', 'Kohli'))
plt.title("Sachin vs. Kohli: distribution of values")
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.legend()
plt.show()

# Horizontal box plots. Boxes are placed at positions 1 and 2 in the order
# given; the tick names are set with plt.yticks because boxplot's `labels`
# keyword was renamed to `tick_labels` in newer Matplotlib releases.
plt.boxplot((sachins_data, kohalis_data), vert=False, patch_artist=True)
plt.yticks([1, 2], ['Sachin', 'Kohli'])
plt.xlabel("Value")
plt.show()


In [None]:
# Number of entries and the largest value in Sachin's data. np.max runs as a
# single vectorised reduction, whereas the builtin max() walks the array one
# Python object at a time.
print(len(sachins_data), np.max(sachins_data))

In [None]:
# Density-normalised histogram of Sachin's data with automatically chosen bins.
plt.hist(x=sachins_data, density=True, bins='auto')
plt.show()

In [None]:
# 10,000 draws from {0, 1} (randint's upper bound is exclusive), taken from the
# global NumPy random state, shown as a two-bin density histogram.
# plt.hist returns (bar heights, bin edges, patches) -> c, b, p.
coin_flips = np.random.randint(0, 2, size=10000)
c, b, p = plt.hist(coin_flips, bins=2, density=True, edgecolor='black')

In [None]:
# Integer bin edges 0, 1, ..., 200 give unit-width bins, so with density=True
# each bar height is the fraction of the counted values that fall in that bin.
# Values outside [0, 200] are not counted.
unit_bin_edges = range(201)
plt.hist(sachins_data, bins=unit_bin_edges, density=True);

In [None]:
# Same unit-width binning computed without drawing: `pmf` holds the per-bin
# densities and `bins` the 201 bin edges.
pmf, bins = np.histogram(a=sachins_data, bins=range(201), density=True)
print(pmf, bins, sep="\n")