In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

## Import the Saudi Arabia Population Data Set

In [None]:
# Location of the population CSV inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/population-of-saudi-arabia/Population estimates by gender nationality and region 2010 - 2022_data.csv"

# Load the estimates into the DataFrame used by the rest of the notebook.
df = pd.read_csv(DATA_PATH)

## Explore Data Set 

In [None]:
# Preview the first rows to check the columns and how the values are formatted.
df.head()

In [None]:
# Preview the last rows as well, to see how the file ends.
df.tail()

In [None]:
# Show column names, non-null counts and dtypes of the freshly loaded data.
df.info()

We can see that all columns have the object data type, so we will convert the 'Year' and 'Population estimates' columns to integers.

In [None]:
# Show the first raw values of the two columns that need conversion.
for column_name in ("Year", "Population estimates"):
    print(df[column_name].head())

## Data Wrangling

The values in the 'Year' and 'Population estimates' columns appear to contain commas, so pandas reads them as text rather than numbers. To perform the necessary data transformations, we will remove these commas and convert the values to integers.

The steps are as follows:

**Step 1:** Remove commas from the 'Year' and 'Population estimates' columns using the `str.replace()` method, then cast the cleaned values to integers with `astype(int)`:

In [None]:
# Strip the comma separators and cast both columns to integers.
# The dtype guard makes this cell safe to re-run: once a column has been
# converted (to int here, or to datetime by a later cell) it is no longer
# text, and calling `.str` on it again would raise an AttributeError.
for column in ('Year', 'Population estimates'):
    already_converted = (
        pd.api.types.is_numeric_dtype(df[column])
        or pd.api.types.is_datetime64_any_dtype(df[column])
    )
    if already_converted:
        continue
    df[column] = (
        df[column]
        .str.replace(',', '', regex=False)  # treat ',' as a literal, not a regex
        .astype(int)
    )

**Step 2:** Once the commas are removed, convert the 'Year' column to a datetime type with `pd.to_datetime()`, so that it represents a time-based variable:

In [None]:
# Parse the year values (format '%Y') into pandas datetimes and store them
# back in the 'Year' column.
year_timestamps = pd.to_datetime(df['Year'], format='%Y')
df['Year'] = year_timestamps

**Step 3:** Now that the 'Population estimates' column is numeric (and 'Year' is a datetime), you can generate summary statistics for the numerical column using the `describe()` method:

In [None]:
# Summary statistics (count, mean, spread, quartiles) of the population estimates.
df['Population estimates'].describe()

In [None]:
# Re-check the dtypes to confirm the conversions above took effect.
df.info()

In [None]:
# List the column names available for the analysis below.
df.columns

## Answers to Some Questions That Describe the Overall Dataset

***Question 1:*** What are the different regions included in the dataset, and how many unique regions are there?

In [None]:
# List the distinct region names, then display how many there are.
region_column = df['Region']
print(region_column.unique())
region_column.nunique()

***Question 2:*** How many years of population data are available in the dataset?

In [None]:
# List the distinct years covered by the data, then display how many there are.
year_column = df['Year']
print(year_column.unique())
year_column.nunique()

***Question 3:*** What is the overall population trend in Saudi Arabia from 2010 to 2022?

In [None]:
# A trend needs one value per year, so total the estimates within each year
# instead of collapsing every year into one grand total (adding the yearly
# estimates together counts the same residents once per year).
# NOTE(review): if the CSV also carries pre-aggregated "Total" rows for region,
# gender or nationality, drop them first to avoid double counting — confirm.
population_per_year = df.groupby('Year')['Population estimates'].sum()

# Grand total over all rows, kept under its original name in case a later
# cell still refers to it. It is not a population figure on its own.
df2 = df['Population estimates'].sum()

# groupby sorts its keys, so the first/last entries are the earliest/latest year.
first_year, last_year = population_per_year.index[0], population_per_year.index[-1]
net_change = population_per_year.iloc[-1] - population_per_year.iloc[0]

print("Population per year:")
print(population_per_year)
print("Change from", first_year, "to", last_year, "=", net_change)

***Question 4:*** Which year had the highest population estimate, and which year had the lowest?

In [None]:
# Total the estimates within each year first. Taking idxmax()/idxmin() on the
# raw rows would return the year of the single largest/smallest row, not the
# year in which the population as a whole was highest/lowest.
yearly_population = df.groupby('Year')['Population estimates'].sum()

# Find the year with the highest population estimate
year_highest_population = yearly_population.idxmax()

# Find the year with the lowest population estimate
year_lowest_population = yearly_population.idxmin()

print("Year with the highest population estimate:", year_highest_population)
print("Year with the lowest population estimate:", year_lowest_population)

***Question 5:*** How does the population vary across different regions of Saudi Arabia?

**Ans:** To analyze how the population varies across the regions of Saudi Arabia, we can use various visualization techniques. One effective approach is a bar plot (or a stacked bar plot) that compares the population sizes of the different regions.

In [None]:
# Total the estimates per region. The sum runs over every row of a region,
# so it spans all the years present in df, not a single year.
population_by_region = df.groupby('Region')['Population estimates'].sum()

# Draw the regional totals as a bar chart on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 6))
population_by_region.plot(kind='bar', ax=ax)
ax.set_title('Population by Region in Saudi Arabia')
ax.set_xlabel('Region')
ax.set_ylabel('Population')
plt.setp(ax.get_xticklabels(), rotation=45)  # tilt the region names
ax.grid(True)
plt.show()

***Question 6:*** What is the gender distribution within the Saudi Arabian population?

In [None]:
# Total the estimates for each gender value.
population_by_gender = df.groupby('Gender')['Population estimates'].sum()

# Show each gender's share of the total as a labelled pie chart.
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    population_by_gender,
    labels=population_by_gender.index,
    autopct='%1.1f%%',  # print the percentage on each wedge
)
ax.set_title('Gender Distribution in Saudi Arabian Population')
plt.show()

***Question 7:*** Are there any significant differences in population estimates based on nationality?

In [None]:
# Total the estimates per nationality and order them from largest to smallest.
population_by_nationality = (
    df.groupby('Nationality')['Population estimates']
    .sum()
    .sort_values(ascending=False)
)

# Draw the sorted totals as a bar chart on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 6))
population_by_nationality.plot(kind='bar', ax=ax)
ax.set_title('Population by Nationality in Saudi Arabia')
ax.set_xlabel('Nationality')
ax.set_ylabel('Population')
plt.setp(ax.get_xticklabels(), rotation=45)  # tilt the category names
ax.grid(True)
plt.show()

***Question 8:*** Can we identify any specific trends or patterns in the population data over the years?

In [None]:
# Average the estimates of all rows belonging to each year.
population_by_year = df.groupby('Year')['Population estimates'].mean()

# Plot the yearly averages as a line with a marker at every year.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(population_by_year.index, population_by_year.values, marker='o')
ax.set_title('Population Trend in Saudi Arabia over the Years')
ax.set_xlabel('Year')
ax.set_ylabel('Population')
plt.setp(ax.get_xticklabels(), rotation=45)  # tilt the year labels
ax.grid(True)
plt.show()

***Question 9:*** Are there any outliers or anomalies in the population estimates?

In [None]:
estimates = df['Population estimates']

# Standardise each estimate: its distance from the mean, measured in
# standard deviations.
z_scores = (estimates - estimates.mean()) / estimates.std()

# Horizontal box plot of the raw estimates to show their spread.
fig, ax = plt.subplots(figsize=(8, 6))
ax.boxplot(estimates, vert=False)
ax.set_title('Population Estimates - Box Plot')
ax.set_xlabel('Population')
plt.show()

# Rows lying more than three standard deviations from the mean.
outliers = df[z_scores.abs() > 3]
print("Potential outliers:")
print(outliers)

***Question 10:*** Is there any correlation between population estimates and other variables, such as region, gender, or nationality?

In [None]:
# One point per row: the estimate plotted against the row's region.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['Region'], df['Population estimates'])
ax.set_title('Population Estimates by Region')
ax.set_xlabel('Region')
ax.set_ylabel('Population')
plt.setp(ax.get_xticklabels(), rotation=45)  # tilt the region names
plt.show()

In [None]:
# One point per row: the estimate plotted against the row's gender.
plt.figure(figsize=(3, 6))
plt.scatter(df['Gender'], df['Population estimates'])
# The title previously said "by Region" (copied from the region plot),
# although this figure is grouped by gender.
plt.title('Population Estimates by Gender')
plt.xlabel('Gender')
plt.ylabel('Population')
plt.xticks(rotation=45)
plt.show()

In [None]:
# One point per row: the estimate plotted against the row's nationality.
plt.figure(figsize=(3, 6))
plt.scatter(df['Nationality'], df['Population estimates'])
# The title previously said "by Region" (copied from the region plot),
# although this figure is grouped by nationality.
plt.title('Population Estimates by Nationality')
plt.xlabel('Nationality')
plt.ylabel('Population')
plt.xticks(rotation=45)
plt.show()

## Some Interactive Graphs Using Plotly

### Overall Graph Using Plotly

In [None]:
# Bubble chart: one marker per row, positioned by region and year, sized by
# the population estimate and coloured by year; hovering shows the gender.
fig = px.scatter(
    df,
    x="Region",
    y="Year",
    size="Population estimates",
    color="Year",
    hover_name="Gender",
    size_max=60,
    title='Population of Saudia Arab',
)
fig.show()

In [None]:
# Interactive pie chart: each region's share of the summed estimates.
fig = px.pie(
    df,
    values='Population estimates',
    names='Region',
    title='Region Base Population of Saudia Arab',
)
fig.show()

In [None]:
# Interactive pie chart: each year's share of the summed estimates.
fig = px.pie(
    df,
    values='Population estimates',
    names='Year',
    title='Year Base Population of Saudia Arab',
)
fig.show()