# __Pandas DataFrame__

In [2]:
import pandas as pd

# List every name defined in the pandas namespace: public classes and
# functions as well as dunder and underscore-prefixed attributes.
dir(pd)

['ArrowDtype',
 'BooleanDtype',
 'Categorical',
 'CategoricalDtype',
 'CategoricalIndex',
 'DataFrame',
 'DateOffset',
 'DatetimeIndex',
 'DatetimeTZDtype',
 'ExcelFile',
 'ExcelWriter',
 'Flags',
 'Float32Dtype',
 'Float64Dtype',
 'Grouper',
 'HDFStore',
 'Index',
 'IndexSlice',
 'Int16Dtype',
 'Int32Dtype',
 'Int64Dtype',
 'Int8Dtype',
 'Interval',
 'IntervalDtype',
 'IntervalIndex',
 'MultiIndex',
 'NA',
 'NaT',
 'NamedAgg',
 'Period',
 'PeriodDtype',
 'PeriodIndex',
 'RangeIndex',
 'Series',
 'SparseDtype',
 'StringDtype',
 'Timedelta',
 'TimedeltaIndex',
 'Timestamp',
 'UInt16Dtype',
 'UInt32Dtype',
 'UInt64Dtype',
 'UInt8Dtype',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__docformat__',
 '__file__',
 '__git_version__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 '_built_with_meson',
 '_config',
 '_is_numpy_dev',
 '_libs',
 '_pandas_datetime_CAPI',
 '_pandas_parser_CAPI',
 '_testing',
 '_typing',
 '_version_meson',
 'annota

## __Agenda__

- Introduction to Pandas DataFrame
  * Creating a DataFrame Using Different Methods
  * Accessing the DataFrame
  * Understanding DataFrame Basics
- Introduction to Statistical Operations in Pandas
  * Descriptive Statistics
  * Mean, Median, and Standard Deviation
  * Correlation Analysis

## __1. Introduction to Pandas DataFrame__

A Pandas DataFrame is a two-dimensional, tabular data structure with labeled axes (rows and columns).

It is a primary data structure in the Pandas library, providing a versatile and efficient way to handle and manipulate data in Python.

![Introduction to Pandas DataFrame](https://labcontent.simplicdn.net/data-content/content-assets/Data_and_AI/ADSP_Images/Lesson_04_Working_with_Pandas/2_Introduction_to_DataFrame/Introduction_to_Pandas_DataFrame.png)

### __Key Features:__
- __Tabular structure:__ The DataFrame is organized as a table with rows and columns, similar to a spreadsheet or SQL table.

- __Labeled axes:__ Both rows and columns are labeled, allowing for easy indexing and referencing of data.

- __Heterogeneous data types:__ Each column in a DataFrame can contain different types of data, such as integers, floats, strings, or even complex objects.

- __Versatility:__ DataFrames can be created from, and exported to, a wide range of data sources, including CSV files, Excel workbooks, SQL databases, and more.

- __Data alignment:__ Operations between DataFrames automatically align data on their row and column labels; labels present in only one operand yield missing values (NaN) instead of raising an error.

### __1.1 Creating a DataFrame Using Different Methods__
Creating a Pandas DataFrame is a fundamental step in data analysis and manipulation.
- Diverse methods are available within Pandas to generate a DataFrame, addressing various data sources and structures.
- Data, whether in Python dictionaries, lists, NumPy arrays, or external files such as CSV and Excel, can be seamlessly transformed into a structured tabular format by Pandas.

In [4]:
import pandas as pd

# Build a DataFrame from a dictionary: every key becomes a column label and
# every list supplies that column's values, one entry per row.
data_dict = dict(
    Name=['Alice', 'Bob', 'Charlie'],
    Age=[25, 30, 22],
    Salary=[50000, 60000, 45000],
)

df_dict = pd.DataFrame(data=data_dict)
print(df_dict)


      Name  Age  Salary
0    Alice   25   50000
1      Bob   30   60000
2  Charlie   22   45000


In [5]:
# Build a DataFrame from a list of rows: each inner list is one record whose
# fields follow the order of the column labels below.
columns = ['Name', 'Age', 'Salary']
data_list = [
    ['Ali', 56, 50000],
    ['Bobby', 30, 60000],
    ['Charly', 22, 45000],
]

# Custom row labels (r1, r2, r3) replace the default 0, 1, 2 index.
df_list = pd.DataFrame(
    data=data_list,
    index=['r1', 'r2', 'r3'],
    columns=columns,
)
print(df_list)

      Name  Age  Salary
r1     Ali   56   50000
r2   Bobby   30   60000
r3  Charly   22   45000


In [6]:
# Build a DataFrame from a two-dimensional NumPy array.
import numpy as np

# NOTE: a NumPy array holds a single dtype, so mixing text and numbers turns
# every element into a string; Age and Salary are strings in this frame.
data_array = np.array(
    [
        ['Alice', 25, 50000],
        ['Bob', 30, 60000],
        ['Charlie', 22, 45000],
    ]
)

df_array = pd.DataFrame(data=data_array, columns=columns)
print(df_array)

      Name Age Salary
0    Alice  25  50000
1      Bob  30  60000
2  Charlie  22  45000


In [7]:
# Load a CSV file into a DataFrame, then preview its first 10 rows.
df_csv = pd.read_csv(filepath_or_buffer='HousePrices.csv')
df_csv.head(n=10)



Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA
5,2014-05-02 00:00:00,490000.0,2.0,1.0,880,6380,1.0,0,0,3,880,0,1938,1994,522 NE 88th St,Seattle,WA 98115,USA
6,2014-05-02 00:00:00,335000.0,2.0,2.0,1350,2560,1.0,0,0,3,1350,0,1976,0,2616 174th Ave NE,Redmond,WA 98052,USA
7,2014-05-02 00:00:00,482000.0,4.0,2.5,2710,35868,2.0,0,0,3,2710,0,1989,0,23762 SE 253rd Pl,Maple Valley,WA 98038,USA
8,2014-05-02 00:00:00,452500.0,3.0,2.5,2430,88426,1.0,0,0,4,1570,860,1985,0,46611-46625 SE 129th St,North Bend,WA 98045,USA
9,2014-05-02 00:00:00,640000.0,4.0,2.0,1520,6200,1.5,0,0,3,1520,0,1945,2010,6811 55th Ave NE,Seattle,WA 98115,USA


In [12]:
# Transpose the first 3 rows so each record is shown as a column; this makes
# a wide frame easier to read.
df_csv.head(n=3).transpose()


Unnamed: 0,0,1,2
date,2014-05-02 00:00:00,2014-05-02 00:00:00,2014-05-02 00:00:00
price,313000.0,2384000.0,342000.0
bedrooms,3.0,5.0,3.0
bathrooms,1.5,2.5,2.0
sqft_living,1340,3650,1930
sqft_lot,7912,9050,11947
floors,1.5,2.0,1.0
waterfront,0,0,0
view,0,4,0
condition,3,5,4


In [16]:
# Load an Excel workbook into a DataFrame (reading .xlsx needs the openpyxl
# library installed), then preview its first five rows.
import pandas as pd

df_excel = pd.read_excel(io='Iris.xlsx')
df_excel.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


**Note:** Reading Excel files with `pd.read_excel` requires the `openpyxl` library. If it is not installed, run `%pip install openpyxl` before loading the workbook.

In [17]:
# Preview the last five rows of the Excel data.
df_excel.tail(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


### __1.2 Accessing the DataFrame__

Accessing a Pandas DataFrame involves employing various methods for selecting and retrieving data, whether it be specific columns, rows, or individual cells.
- Utilizing square brackets, iloc and loc indexers, and conditions, analysts can navigate and extract the necessary information from the DataFrame for further analysis and manipulation.
- The flexibility of Pandas allows for both label-based and position-based indexing, offering a versatile toolkit for accessing and working with data efficiently.

In [36]:
import pandas as pd

# Sample data: one key per column, one list entry per row.
data = dict(
    Column_name=[5, 15, 8],
    Column1=[10, 20, 30],
    Column2=[100, 200, 300],
    Another_column=[25, 35, 45],
)

df = pd.DataFrame(data=data)
print(df)

# Square brackets with a single label return that column as a Series.
column_data = df['Column_name']
print("Single column:", column_data, df['Another_column'], sep="\n")

   Column_name  Column1  Column2  Another_column
0            5       10      100              25
1           15       20      200              35
2            8       30      300              45
Single column:
0     5
1    15
2     8
Name: Column_name, dtype: int64
0    25
1    35
2    45
Name: Another_column, dtype: int64


In [37]:

# Passing a list of labels selects several columns at once and returns a
# DataFrame rather than a Series.
selected_columns = df[['Column1', 'Column2']]
print("\nMultiple columns:", selected_columns, sep="\n")


Multiple columns:
   Column1  Column2
0       10      100
1       20      200
2       30      300


In [38]:
# Show the whole frame first, for reference.
print(df)

# .iloc selects by integer position: position 0 is the first row, returned
# as a Series whose index holds the column labels.
row_data = df.iloc[0]
print("\nSpecific row:", row_data, sep="\n")

   Column_name  Column1  Column2  Another_column
0            5       10      100              25
1           15       20      200              35
2            8       30      300              45

Specific row:
Column_name         5
Column1            10
Column2           100
Another_column     25
Name: 0, dtype: int64


In [None]:
# Boolean-mask filtering: the comparison yields one True/False per row, and
# indexing with that mask keeps only the rows where 'Column_name' exceeds 10.
filtered_rows = df[df['Column_name'] > 10]
print("\nFiltered rows:", filtered_rows, sep="\n")

# Arithmetic on a column is applied element-wise to every value.
multiply = df['Column1'] * 2
print(f"\nMultiply Column1 by 2:{multiply!s}")


Multiply Column1 by 2:0    20
1    40
2    60
Name: Column1, dtype: int64

Filtered rows:
   Column_name  Column1  Column2  Another_column
1           15       20      200              35


In [36]:
# Label-based selection with .loc: ':' keeps every row, and the list picks
# the two named columns.
df.loc[:,['Column1', 'Column2']]

Unnamed: 0,Column1,Column2
0,10,100
1,20,200
2,30,300


In [42]:
# .loc[row_label, column_label] with two scalar labels returns one cell.
value = df.loc[0, 'Column_name']
print("\nSingle cell by label:", value, sep="\n")


Single cell by label:
5


In [23]:
# Accessing a single cell by position: .iat takes integer (row, column)
# positions, so [0, 1] is the first row of the second column.
value = df.iat[0, 1]
print("\nSingle cell by position:")
print(value)

# Accessing a single cell by label with .loc[row_label, column_label]
selected_data = df.loc[0, 'Column_name']
print("\nData using .loc:")
print(selected_data)

# Conditional access: pick the rows and the column in one .loc call instead
# of chaining df[mask][col]. The result is the same here, but chaining does
# two separate indexing steps and is the pattern that triggers
# SettingWithCopyWarning when it is used for assignment.
selected_data = df.loc[df['Column_name'] > 10, 'Another_column']
print("\nConditional access:")
print(selected_data)



Single cell by position:
10

Data using .loc:
5

Conditional access:
1    35
Name: Another_column, dtype: int64


### __1.3 Understanding DataFrame Basics__
- The head() and tail() methods enable users to efficiently preview the initial and final rows of a DataFrame, offering a quick snapshot of its structure and content.
- These functions are invaluable for a preliminary assessment of column names, data types, and potential issues. Additionally, the info() method provides a comprehensive summary, detailing data types, non-null counts, and memory usage, aiding in the identification of missing or inconsistent data.
- The shape attribute, on the other hand, succinctly communicates the dimensions of the DataFrame, encapsulating the number of rows and columns.
- The syntax for some functions is provided below:

![Understanding DataFrame Basics](https://labcontent.simplicdn.net/data-content/content-assets/Data_and_AI/ADSP_Images/Lesson_04_Working_with_Pandas/2_Introduction_to_DataFrame/Understanding_DataFrame_Basics.png)

In [44]:
import pandas as pd

# Sample data: four columns, three rows.
data = dict(
    Column_name=[5, 15, 8],
    Column1=[10, 20, 30],
    Column2=[100, 200, 300],
    Another_column=[25, 35, 45],
)

df = pd.DataFrame(data=data)
print(df)

# head(n) returns the first n rows.
print("First 2 rows:", df.head(n=2), sep="\n")

# tail(n) returns the last n rows.
print("\nLast row:", df.tail(n=1), sep="\n")

# info() prints its own report (index range, per-column non-null counts and
# dtypes, memory usage), so it is called directly rather than inside print().
print("\nDataFrame summary:")
df.info()

# shape is a (rows, columns) tuple.
print("\nDataFrame dimensions:", df.shape, sep="\n")


   Column_name  Column1  Column2  Another_column
0            5       10      100              25
1           15       20      200              35
2            8       30      300              45
First 2 rows:
   Column_name  Column1  Column2  Another_column
0            5       10      100              25
1           15       20      200              35

Last row:
   Column_name  Column1  Column2  Another_column
2            8       30      300              45

DataFrame summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   Column_name     3 non-null      int64
 1   Column1         3 non-null      int64
 2   Column2         3 non-null      int64
 3   Another_column  3 non-null      int64
dtypes: int64(4)
memory usage: 228.0 bytes

DataFrame dimensions:
(3, 4)


## __2. Introduction to Statistical Operations in Pandas__
Pandas supports the computation of fundamental measures such as mean and median, along with the exploration of correlations and distribution characteristics.

The following examples illustrate key statistical operations available in Pandas:

### __2.1 Descriptive Statistics__
Descriptive statistics offer a snapshot of the dataset's central tendency and dispersion.

The describe() function provides a quick summary, including mean, standard deviation, and quartile information.

In [46]:
import pandas as pd

# Sample data: three numeric columns, three rows.
data = dict(
    Numeric_column1=[5, 15, 8],
    Numeric_column2=[10, 20, 30],
    Numeric_column3=[100, 200, 300],
)

df = pd.DataFrame(data=data)
print(df)

# describe() reports count, mean, std, min, quartiles and max per column.
print("Descriptive statistics for numeric columns:", df.describe(), sep="\n")


   Numeric_column1  Numeric_column2  Numeric_column3
0                5               10              100
1               15               20              200
2                8               30              300
Descriptive statistics for numeric columns:
       Numeric_column1  Numeric_column2  Numeric_column3
count         3.000000              3.0              3.0
mean          9.333333             20.0            200.0
std           5.131601             10.0            100.0
min           5.000000             10.0            100.0
25%           6.500000             15.0            150.0
50%           8.000000             20.0            200.0
75%          11.500000             25.0            250.0
max          15.000000             30.0            300.0


### __2.1 Mean, Median, and Standard Deviation__

In [50]:
import pandas as pd

# Sample data: three numeric columns, three rows.
data = dict(
    Numeric_column1=[5, 15, 8],
    Numeric_column2=[10, 20, 30],
    Numeric_column3=[100, 200, 300],
)

df = pd.DataFrame(data=data)

# mean() on a single column returns one number; median() and std() on the
# whole frame return a Series with one value per column.
mean_value = df['Numeric_column1'].mean()
median_value = df.median()
std_deviation = df.std()

print(f"Mean:\n {mean_value!s}")
print(f"\nMedian:\n {median_value!s}")
print(f"\nStandard deviation:\n {std_deviation!s}")


Mean:
 9.333333333333334

Median:
 Numeric_column1      8.0
Numeric_column2     20.0
Numeric_column3    200.0
dtype: float64

Standard deviation:
 Numeric_column1      5.131601
Numeric_column2     10.000000
Numeric_column3    100.000000
dtype: float64


### __2.2 Correlation Analysis__
The corr() function generates a correlation matrix, indicating how variables relate to each other.

Values closer to 1 or -1 imply a stronger correlation, while values near 0 suggest a weaker correlation.

In [25]:
import pandas as pd

# Sample data: three numeric columns, three rows.
data = dict(
    Numeric_column1=[5, 15, 8],
    Numeric_column2=[10, 20, 30],
    Numeric_column3=[100, 200, 300],
)

df = pd.DataFrame(data=data)

# corr() returns a square matrix holding the correlation of every pair of
# columns; the diagonal is each column against itself.
correlation_matrix = df.corr()

print(f"Correlation matrix:\n {correlation_matrix!s}")


Correlation matrix:
                  Numeric_column1  Numeric_column2  Numeric_column3
Numeric_column1         1.000000         0.292306         0.292306
Numeric_column2         0.292306         1.000000         1.000000
Numeric_column3         0.292306         1.000000         1.000000


#### __Value Counts__
The value_counts() function tallies the occurrences of unique values in a categorical column, aiding in understanding the distribution of categorical data.

In [28]:
import pandas as pd

# Sample data: a single categorical column with ten one-letter labels.
data = {'Category': list('ABACBACABC')}
df = pd.DataFrame(data=data)
print(df)

# value_counts() tallies how many times each distinct label occurs.
value_counts = df['Category'].value_counts()

print(f"Value counts:\n {value_counts!s}")


  Category
0        A
1        B
2        A
3        C
4        B
5        A
6        C
7        A
8        B
9        C
Value counts:
 Category
A    4
B    3
C    3
Name: count, dtype: int64


In [29]:
# Count the rows labelled 'A': the comparison gives one True/False per row,
# and summing counts the True values.
count_A = df['Category'].eq('A').sum()
print(count_A)

4


In [30]:
# Look up the tally for a single label in the value_counts() result.
value_counts1 = df['Category'].value_counts()['A']
value_counts1

4

In [31]:
# IPython help syntax: a trailing ? shows the signature and docstring of
# DataFrame.value_counts. It only works in IPython/Jupyter, not plain Python.
df.value_counts?

Signature:
df.value_counts(
    subset: 'IndexLabel | None' = None,
    normalize: 'bool' = False,
    sort: 'bool' = True,
    ascending: 'bool' = False,
    dropna: 'bool' = True,
) -> 'Series'
Docstring:
Return a Series containing the frequency of each distinct row in the Dataframe.

Parameters
----------
subset : label or list of labels, optional
    Columns to use when counting unique combinations.
normalize : bool, default False
    Return proportions rather than frequencies.
sort : bool, default True
    Sort by frequencies when True. Sort by DataFrame column values when False.
ascending : bool, default False
    Sort in ascending order.
dropna : bool, default True
    Don't include counts of rows that contain NA values.

    .. versionadded:: 1.3.0

Returns
-------
Series

See Also
--------
Serie

# __Assisted Practice__

## __Problem Statement:__
Analyze a housing dataset using Pandas DataFrame and statistical operations to understand the basic characteristics of the data and the relationships between different variables.

## __Steps to Perform:__
- Load the housing dataset into a Pandas DataFrame
- Familiarize yourself with the DataFrame basics such as its structure, the data types of its columns, and summary statistics
- Calculate descriptive statistics like mean, median, and standard deviation for numerical columns such as __LotArea__, __YearBuilt__, __1stFlrSF__, __2ndFlrSF__, and __SalePrice__
- Determine the correlation between different numerical variables such as __LotArea__ and __SalePrice__, __YearBuilt__ and __SalePrice__, __1stFlrSF__ and __SalePrice__, and __2ndFlrSF__ and __SalePrice__
- Count the number of occurrences of each category in categorical variables such as __Neighborhood__, __BldgType__, and __HouseStyle__

In [18]:
import pandas as pd

# Load the housing dataset; ending the cell with the bare name displays it.
hp = pd.read_csv(filepath_or_buffer="housing_data.csv")
hp

Unnamed: 0.1,Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,0,SC60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,No,No,No,0,Feb,2008,WD,Normal,208500
1,1,SC20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,No,No,No,0,May,2007,WD,Normal,181500
2,2,SC60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,No,No,No,0,Sep,2008,WD,Normal,223500
3,3,SC70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,No,No,No,0,Feb,2006,WD,Abnorml,140000
4,4,SC60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,No,No,No,0,Dec,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1455,SC60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,No,No,No,0,Aug,2007,WD,Normal,175000
1456,1456,SC20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,No,MnPrv,No,0,Feb,2010,WD,Normal,210000
1457,1457,SC70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,No,GdPrv,Shed,2500,May,2010,WD,Normal,266500
1458,1458,SC20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,No,No,No,0,Apr,2010,WD,Normal,142125


In [20]:
# Dimensions of the dataset as a (rows, columns) tuple.
hp.shape

(1460, 81)

In [22]:
# Data type of every column, returned as a Series indexed by column name.
hp.dtypes

Unnamed: 0         int64
MSSubClass        object
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold            object
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object

In [24]:
# Print a structural summary: index range plus each column's non-null count
# and dtype. A low non-null count reveals columns with missing values.
hp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     1460 non-null   int64  
 1   MSSubClass     1460 non-null   object 
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1460 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [26]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the dataset.
hp.describe()

Unnamed: 0.1,Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,YrSold,SalePrice
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,729.5,57.623288,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.117123,443.639726,46.549315,...,472.980137,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,2007.815753,180921.19589
std,421.610009,34.664304,9981.264932,1.382997,1.112799,30.202904,20.645407,180.731373,456.098091,161.319273,...,213.804841,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,1.328095,79442.502883
min,0.0,0.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2006.0,34900.0
25%,364.75,42.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,...,334.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2007.0,129975.0
50%,729.5,63.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,0.0,...,480.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,2008.0,163000.0
75%,1094.25,79.0,11601.5,7.0,6.0,2000.0,2004.0,164.25,712.25,0.0,...,576.0,168.0,68.0,0.0,0.0,0.0,0.0,0.0,2009.0,214000.0
max,1459.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,...,1418.0,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,2010.0,755000.0


In [30]:
# The same summary statistics, restricted to the single column LotArea.
hp['LotArea'].describe()

count      1460.000000
mean      10516.828082
std        9981.264932
min        1300.000000
25%        7553.500000
50%        9478.500000
75%       11601.500000
max      215245.000000
Name: LotArea, dtype: float64

In [32]:
# Tally how many houses fall into each HouseStyle category.
value_count3 = hp["HouseStyle"].value_counts()
value_count3

HouseStyle
1Story    726
2Story    445
1.5Fin    154
SLvl       65
SFoyer     37
1.5Unf     14
2.5Unf     11
2.5Fin      8
Name: count, dtype: int64

In [36]:
# Correlation matrix restricted to sale price and second-floor area.
numerical_cols4 = ["SalePrice", "2ndFlrSF"]
correlations4 = hp.loc[:, numerical_cols4].corr()
correlations4

Unnamed: 0,SalePrice,2ndFlrSF
SalePrice,1.0,0.319334
2ndFlrSF,0.319334,1.0


In [38]:
# Correlation between two individual columns, returned as a single number
# (the off-diagonal value of the corresponding 2x2 correlation matrix).
hp['SalePrice'].corr(hp['2ndFlrSF'])

0.3193338028320681