# Read a DataFrame from a CSV file


In [0]:
# Load the raw CSV into a Spark DataFrame: the first row supplies the column names and
# Spark infers each column's data type from the file contents.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('/Volumes/learning/data/raw/Life Expectancy Data.csv')
)

In [0]:

# Print the column names of the Spark DataFrame as they were read from the CSV header.
print(df.columns)

# Cleaning the column names

# Viewing column names using the `columns` attribute


In [0]:
# `df.columns` is the list of column names of the Spark DataFrame.
print(df.columns)

# Viewing first 5 rows



# Checking data types of columns

In [0]:
# Show each column name together with the data type Spark assigned to it.
df.dtypes

# Counting the number of rows and columns present in the DataFrame

In [0]:
# df.count() triggers a Spark job that counts the rows; the column count comes from the
# schema and needs no job.
row1 = df.count()
col1 = len(df.columns)
# Report the shape in the conventional (rows, columns) order.
print(f'Dimension of the Dataframe is: {(row1, col1)}')

# Column renaming: because a Spark DataFrame is immutable, the renamed result is saved as a pandas DataFrame and then converted back into a Spark DataFrame; the new names are looked up with the dict `get` method and applied with `toDF`.

In [0]:
# Replacement names for the headers that contain characters such as '-' or '/'.
mapping = {'under-five deaths':'Under 5yr Deaths','HIV/AIDS':'HIV AIDS','thinness  1-19 years':'thinness  1 to 19 years','thinness 5-9 years':'thinness 5 to 9 years'}
# The raw header appears to pad some names with spaces (other cells in this notebook index
# 'Life expectancy ' with a trailing space), so an exact-match lookup would silently skip
# them. Look each name up by its stripped form; names without a mapping are kept unchanged.
new_names = [mapping.get(name.strip(), name) for name in df.columns]
# NOTE(review): df.toDF(*new_names) already returns a renamed Spark DataFrame. The pandas
# round-trip below collects every row onto the driver; it is kept because this cell
# demonstrates that technique -- confirm whether it is still wanted.
pandas_df = df.toDF(*new_names).toPandas()
df1 = spark.createDataFrame(pandas_df)
df1.show(5)

In [0]:
from functools import reduce

# Current column names, in schema order, and their replacements in the same order.
oldColumns = df.schema.names
newColumns = ["Country","Year","Status","LifeExpectancy","AdultMortality","InfantDeaths","Alcohol","PercentageExpenditure","HepatitisB","Measles","BMI", "Under5yrDeaths","Polio","TotalExpenditure","Diphtheria","HIV_or_AIDS","GDP", "Population", "Thinness1to19years", "Thinness5to9years","IncomeCompositionOfResources", "Schooling"]

# The rename is positional. Without this check, a DataFrame with more columns than names
# fails with an opaque IndexError, and one with fewer silently leaves names unused.
if len(oldColumns) != len(newColumns):
    raise ValueError(
        f"Cannot rename columns: the DataFrame has {len(oldColumns)} columns "
        f"but {len(newColumns)} new names were provided"
    )

# Apply one withColumnRenamed per (old, new) pair, threading the DataFrame through reduce.
# `df` is rebound to the renamed DataFrame, which the following cells use.
df = reduce(
    lambda renamed, names: renamed.withColumnRenamed(names[0], names[1]),
    zip(oldColumns, newColumns),
    df,
)
df.printSchema()
df.show(5)

In [0]:
# sqlContext.registerDataFrameAsTable(df, "myTable")
# df2 = sqlContext.sql("SELECT 'under-five deaths' AS 'Under 5yr Deaths', 'HIV/AIDS' as 'HIV AIDS', 'thinness  1-19 years' as 'thinness  1 to 19 years', 'thinness 5-9 years' as 'thinness 5 to 9 years'  from myTable")

# df2.show(5)

In [0]:
# To rename columns in a Spark DataFrame, you can use the withColumnRenamed method. Of the possible approaches, only the second is shown below:
# 2. Rename multiple columns (chaining withColumnRenamed)

# Rename multiple columns by chaining
# df_renamed = df.withColumnRenamed("under-five deaths", "Under 5yr Deaths").withColumnRenamed("HIV/AIDS", "HIV AIDS").withColumnRenamed("thinness  1-19 years", "thinness  1 to 19 years").withColumnRenamed("thinness 5-9 years", "thinness 5 to 9 years")
# df_renamed.show(5)


# Save spark dataframe as table in databricks

In [0]:
# Persist the DataFrame as a managed table. The default save mode raises an error when the
# table already exists, which breaks every re-run of this notebook; "overwrite" replaces
# the table's contents so the cell can be run repeatedly.
df.write.mode("overwrite").saveAsTable("learning.data.life_expectancy")

# SQL query (run with the %sql cell magic)

In [0]:
%sql
-- Preview the first 10 rows of the saved table.
SELECT *
FROM learning.data.life_expectancy
LIMIT 10

In [0]:
import pandas as pd

# Read the raw CSV into a pandas DataFrame and show its first five rows.
life_ex_df = pd.read_csv(
    filepath_or_buffer='/Volumes/learning/data/raw/Life Expectancy Data.csv'
)
life_ex_df.head(n=5)

In [0]:
# Summary statistics for the columns of the pandas DataFrame.
life_ex_df.describe()


In [0]:
# Column-wise totals of the 'Life expectancy ' and 'GDP' columns over all rows.
sum_life_ex = life_ex_df.loc[:, ['Life expectancy ', 'GDP']].sum(axis='index')
print(sum_life_ex)




In [0]:
# Column-wise averages of the 'Life expectancy ' and 'GDP' columns over all rows.
avg_life_ex = life_ex_df.loc[:, ['Life expectancy ', 'GDP']].mean(axis='index')
print(avg_life_ex)

In [0]:
# Show the first 10 rows of the pandas DataFrame.
life_ex_df.head(10)

In [0]:
# Print a summary of the DataFrame: columns, non-null counts and dtypes.
life_ex_df.info()

In [0]:
import pandas as pd 
import datetime as dt 

# Assumes 'Year' holds calendar years such as 2015 -- confirm against the data.
# An explicit format is required: without it pandas reads integers as nanoseconds since
# the Unix epoch, which turns every row into 1970 and makes year_dif the same for all rows.
life_ex_df['year'] = pd.to_datetime(life_ex_df['Year'], format='%Y')
today = dt.date.today()
print(today.year)


# Years elapsed between each observation's year and the current year.
year_dif = today.year - life_ex_df['year'].dt.year
print(year_dif.head())
print(year_dif.min(axis=0))
print(year_dif.max(axis=0))

In [0]:
# Number of missing entries per column.
life_ex_df.isna().sum()

# Univariate Analysis #

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# To inspect the available column names, uncomment the next line.
# print(life_ex_df.columns)

# Histogram with a KDE overlay of the 'Life expectancy ' column, drawn on an explicit Axes.
plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(
    life_ex_df['Life expectancy '],
    kde=True,
    bins=30,
    color='dodgerblue',
    ax=ax,
)
ax.set_title('Distribution of Life Expectancy', fontsize=16)
ax.set_xlabel('Life Expectancy (Years)', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
plt.show()

# Bivariate Analysis#

In [0]:



# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
correlation_matrix = life_ex_df.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(20, 16))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=.5,
    ax=ax,
)
ax.set_title('Correlation Matrix of Features', fontsize=20)
plt.show()

In [0]:
%sql

-- Return every row of the saved table.
SELECT *
FROM learning.data.life_expectancy