In [3]:
#import libraries
import numpy as np
import pandas as pd

## Task 1: Numpy Basics
------

In [21]:
#1.1 NumPy Array Creation

oned = np.arange(1, 21)
twod = np.random.randint(1,101, size=(5,4)) #create an array of ramdom numbers from 1-100

#print values
print("1D array (integers from 1 10 20):")
print(oned)
print("\n2D array (5x4, filled with random numbers)")
print(twod)

1D array (integers from 1 10 20):
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]

2D array (5x4, filled with random numbers)
[[ 1 41 56 95]
 [28 44  6 89]
 [ 6 88 32 84]
 [50 47 18 99]
 [35 75 78 25]]


In [28]:
#1.2 Array Indexing & Slicing

even_oned = oned[oned % 2==0] #extract even numbers from the 1D array
select_twod = twod[[1,3], :] #select the second and fourth rows from the 2D array

#print values
print("Even numbers from 1D array: ")
print(even_oned)
print("\nSecond and Fourth rows from 2D array: ")
print(select_twod)

Even numbers from 1D array: 
[ 2  4  6  8 10 12 14 16 18 20]

Second and Fourth rows from 2D array: 
[[28 44  6 89]
 [50 47 18 99]]


In [32]:
#1.3 Mathematical Operations
array = np.arange(10,101, 10) #create array w/ numbers 1-10 that skip by 10
array_square = np.square(array) #create array squared
array_sqrt = np.sqrt(array) #square root the array 

#print values
print("Original Array:")
print(array)

print("\nArray Squared:")
print(array_square)

print("\nArray Square Root:")
print(array_sqrt)

Original Array:
[ 10  20  30  40  50  60  70  80  90 100]

Array Squared:
[  100   400   900  1600  2500  3600  4900  6400  8100 10000]

Array Square Root:
[ 3.16227766  4.47213595  5.47722558  6.32455532  7.07106781  7.74596669
  8.36660027  8.94427191  9.48683298 10.        ]


In [42]:
#1.4 Statistical Operations
stat_array = np.random.randint(1,51, size=(3,3)) #make array of random integers 1-50
mean_array = np.mean(stat_array) #find mean of array values
medi_array = np.median(stat_array) #find median of array values
std_array= np.std(stat_array) #find  standard mediation of array values

#print values
print("Original Array:")
print(stat_array)

print("\nArray Mean Value:")
print(mean_array)

print("\nArray Median Value:")
print(medi_array)

print("\nArray Standard Deviation:")
print(std_array)

Original Array:
[[ 9 35 23]
 [41 10 38]
 [16 32  9]]

Array Mean Value:
23.666666666666668

Array Median Value:
23.0

Array Standard Deviation:
12.382783747337806


## Task 2: Pandas Basics
---------

### 2.1 Data Loading & Overview

In [61]:
df = pd.read_csv("project1_part2_data.csv") #load data set

In [62]:
print(df.head()) #print first 5 rows

   ID           Name   Age   Salary  Department    Position  \
0   1  Alice Johnson  49.0  46910.0  Operations  Consultant   
1   2  Michael Smith  28.0  40206.0          IT     Manager   
2   3  Emma Williams  30.0  63419.0       Sales     Analyst   
3   4    David Brown  29.0  90636.0          HR   Executive   
4   5   Olivia Jones  33.0      NaN   Marketing  Consultant   

   Work Experience (Years) Education Level  
0                       13             PhD  
1                        9       Associate  
2                       15             PhD  
3                       13      Bachelor's  
4                        1         Diploma  


In [65]:
print(df.info) #print dataset info
print(df.isnull()) #check for missing values

<bound method DataFrame.info of     ID               Name   Age    Salary  Department     Position  \
0    1      Alice Johnson  49.0   46910.0  Operations   Consultant   
1    2      Michael Smith  28.0   40206.0          IT      Manager   
2    3      Emma Williams  30.0   63419.0       Sales      Analyst   
3    4        David Brown  29.0   90636.0          HR    Executive   
4    5       Olivia Jones  33.0       NaN   Marketing   Consultant   
5    6       James Garcia  55.0   94268.0     Finance  Coordinator   
6    7    Sophia Martinez  54.0   58141.0  Operations      Manager   
7    8  Benjamin Anderson  44.0  111910.0   Marketing      Manager   
8    9   Charlotte Thomas  45.0   96044.0       Sales    Executive   
9   10      Daniel Wilson   NaN  107214.0          HR  Coordinator   
10  11      Amelia Taylor  56.0   73827.0  Operations   Consultant   
11  12      Matthew Moore  43.0   95820.0     Finance  Coordinator   
12  13       Isabella Lee  48.0  102623.0          IT     

In [66]:
df_new = df.copy() #create a copy of the original data frame

### 2.2 Data Selection & Filtering

In [76]:
df_new[["Name", "Salary"]] #extract Name and Salary columns

Unnamed: 0,Name,Salary
0,Alice Johnson,46910.0
1,Michael Smith,40206.0
2,Emma Williams,63419.0
3,David Brown,90636.0
4,Olivia Jones,
5,James Garcia,94268.0
6,Sophia Martinez,58141.0
7,Benjamin Anderson,111910.0
8,Charlotte Thomas,96044.0
9,Daniel Wilson,107214.0


In [81]:
#filter the employees who make over $50000 
df_new.loc[df_new["Salary"] > 50000, ["Name", "Salary"]]

Unnamed: 0,Name,Salary
2,Emma Williams,63419.0
3,David Brown,90636.0
5,James Garcia,94268.0
6,Sophia Martinez,58141.0
7,Benjamin Anderson,111910.0
8,Charlotte Thomas,96044.0
9,Daniel Wilson,107214.0
10,Amelia Taylor,73827.0
11,Matthew Moore,95820.0
12,Isabella Lee,102623.0


### 2.3 Data Aggregation


In [71]:
avg_salary = df_new["Salary"].mean() #find avg salary from column

print("Average Salary: ", avg_salary) #print avg salary


Average Salary:  82143.47368421052


In [83]:
#count the number of employees in each department
df_new["Department"].value_counts()

Department
HR            5
Operations    4
Sales         4
Finance       3
IT            2
Marketing     2
Name: count, dtype: int64

### 2.4 Data Cleaning

In [89]:
#replace empty salaries with mean salary
df_new["Salary"].fillna((avg_salary), inplace=True) 
#the only one without the salary was Olivia Jones...

print(df_new)

    ID               Name   Age         Salary  Department     Position  \
0    1      Alice Johnson  49.0   46910.000000  Operations   Consultant   
1    2      Michael Smith  28.0   40206.000000          IT      Manager   
2    3      Emma Williams  30.0   63419.000000       Sales      Analyst   
3    4        David Brown  29.0   90636.000000          HR    Executive   
4    5       Olivia Jones  33.0   82143.473684   Marketing   Consultant   
5    6       James Garcia  55.0   94268.000000     Finance  Coordinator   
6    7    Sophia Martinez  54.0   58141.000000  Operations      Manager   
7    8  Benjamin Anderson  44.0  111910.000000   Marketing      Manager   
8    9   Charlotte Thomas  45.0   96044.000000       Sales    Executive   
9   10      Daniel Wilson   NaN  107214.000000          HR  Coordinator   
10  11      Amelia Taylor  56.0   73827.000000  Operations   Consultant   
11  12      Matthew Moore  43.0   95820.000000     Finance  Coordinator   
12  13       Isabella Lee

In [90]:
#drop rows where age is missing
df_new.dropna(subset=["Age"], inplace=True)
#Daniel Wilson was the onlyone missing age... his row has been removed

#print updated dataset
print(df_new)


    ID               Name   Age         Salary  Department     Position  \
0    1      Alice Johnson  49.0   46910.000000  Operations   Consultant   
1    2      Michael Smith  28.0   40206.000000          IT      Manager   
2    3      Emma Williams  30.0   63419.000000       Sales      Analyst   
3    4        David Brown  29.0   90636.000000          HR    Executive   
4    5       Olivia Jones  33.0   82143.473684   Marketing   Consultant   
5    6       James Garcia  55.0   94268.000000     Finance  Coordinator   
6    7    Sophia Martinez  54.0   58141.000000  Operations      Manager   
7    8  Benjamin Anderson  44.0  111910.000000   Marketing      Manager   
8    9   Charlotte Thomas  45.0   96044.000000       Sales    Executive   
10  11      Amelia Taylor  56.0   73827.000000  Operations   Consultant   
11  12      Matthew Moore  43.0   95820.000000     Finance  Coordinator   
12  13       Isabella Lee  48.0  102623.000000          IT      Manager   
13  14       Ethan Harris

## Task 3: Combined NumPy and Pandas Operations
--------

### 3.1 Coverting a Column to a Numpy Array

In [93]:
#extract column to an array
salary_array = df_new["Salary"].to_numpy

print(salary_array)

<bound method IndexOpsMixin.to_numpy of 0      46910.000000
1      40206.000000
2      63419.000000
3      90636.000000
4      82143.473684
5      94268.000000
6      58141.000000
7     111910.000000
8      96044.000000
10     73827.000000
11     95820.000000
12    102623.000000
13    115450.000000
14     62299.000000
15     83585.000000
16    104044.000000
17     82557.000000
18     89080.000000
19     42693.000000
Name: Salary, dtype: float64>


In [100]:
#change salary array from pandas
salary_array = df_new["Salary"].to_numpy()

#find standart deviation of the salary
std_salary = np.std(salary_array) 

print("The standard deviation is: ", std_salary)

The standard deviation is:  22310.34519866553


### 3.2 New Column Creation


In [102]:
#add new row for salary bonus with 10% increase
df_new["Salary_Bonus"] = df_new["Salary"] * 1.10


print(df_new)

    ID               Name   Age         Salary  Department     Position  \
0    1      Alice Johnson  49.0   46910.000000  Operations   Consultant   
1    2      Michael Smith  28.0   40206.000000          IT      Manager   
2    3      Emma Williams  30.0   63419.000000       Sales      Analyst   
3    4        David Brown  29.0   90636.000000          HR    Executive   
4    5       Olivia Jones  33.0   82143.473684   Marketing   Consultant   
5    6       James Garcia  55.0   94268.000000     Finance  Coordinator   
6    7    Sophia Martinez  54.0   58141.000000  Operations      Manager   
7    8  Benjamin Anderson  44.0  111910.000000   Marketing      Manager   
8    9   Charlotte Thomas  45.0   96044.000000       Sales    Executive   
10  11      Amelia Taylor  56.0   73827.000000  Operations   Consultant   
11  12      Matthew Moore  43.0   95820.000000     Finance  Coordinator   
12  13       Isabella Lee  48.0  102623.000000          IT      Manager   
13  14       Ethan Harris

### 3.2 New Column Creation

In [106]:
#calculate total salary by department
total_salary = df_new.groupby("Department")["Salary"].sum()

print(total_salary)

Department
Finance       294132.000000
HR            311078.000000
IT            142829.000000
Marketing     194053.473684
Operations    262463.000000
Sales         331100.000000
Name: Salary, dtype: float64


In [108]:
#save cleaned file
df_new.to_csv("project1_part2_data_cleaned.csv", index=False)

#### Submission Guidelines
* Firstname_lastname_Proj1_II.ipynb
* firstname_lastname_Proj1_II.html
* project1_part2_data_cleaned.csv
* Ensure your code is well-documented.
##### NOTE: Output for each of the question to be displayed
