In [1]:
import pandas as pd
url = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-05/villagers.csv"
df = pd.read_csv(url)
df.isna().sum()

row_n           0
id              1
name            0
gender          0
species         0
birthday        0
personality     0
song           11
phrase          0
full_id         0
url             0
dtype: int64

In [2]:
rows, columns = df.shape
print(f'Number of rows: {rows}')
print(f'Number of columns: {columns}')

Number of rows: 391
Number of columns: 11


In [None]:
# difference between observation and variable

#definition of an observation: An observation is the item or entry that is given in the dataset that the attributes relate to. In relevance to the dataset, each individual villager is its own observation, as they all have their own qualities and are analyzed differently.
#definition of a variable: A variable is an attribute or quality about an observation which records different information about each specific observation. Relating to the dataset, the name or species of every single observation is a variable.

In [11]:
numerical_summary = df.describe()
print("Summary of numerical columns:")
print(numerical_summary)

categorical_summary = ['species', 'personality']  
print("\nSummary of categorical columns:")

species_counts = df['species'].value_counts()
print(species_counts)

Summary of numerical columns:
            row_n
count  391.000000
mean   239.902813
std    140.702672
min      2.000000
25%    117.500000
50%    240.000000
75%    363.500000
max    483.000000

Summary of categorical columns:
species
cat          23
rabbit       20
frog         18
squirrel     18
duck         17
dog          16
cub          16
pig          15
bear         15
mouse        15
horse        15
bird         13
penguin      13
sheep        13
elephant     11
wolf         11
ostrich      10
deer         10
eagle         9
gorilla       9
chicken       9
koala         9
goat          8
hamster       8
kangaroo      8
monkey        8
anteater      7
hippo         7
tiger         7
alligator     7
lion          7
bull          6
rhino         6
cow           4
octopus       3
Name: count, dtype: int64


In [17]:
missing_values = df.select_dtypes(include='number').isnull().sum()
print("Missing values in numeric columns:")
print(missing_values)

Missing values in numeric columns:
row_n    0
dtype: int64


In [18]:
import pandas as pd
url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv"
df = pd.read_csv(url)
df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [None]:
# difference between df.shape and df.describe()

#df.shape returns the total number of rows and columns in the dataset, including all columns regardless of their data type. In contrast, df.describe() provides summary statistics only for numeric columns, excluding non-numeric columns from its analysis.
#df.describe() reflects the number of non-null entries for each numeric column, indicating the presence of missing values, whereas df.shape does not account for missing values but shows the total number of rows.

In [None]:
# difference between attribute and method

#An attribute is a characteristic/feature of an object that can be accessed directly without performing any operations or computations, and it does not require parentheses. For example, df.shape reveals the dimensions of the dataframe, including the number of rows and columns.
#A method is a function linked to an object that carries out an action or calculation. It requires parentheses which might contain arguments to be executed. For example, df.describe() calculates and provides summary statistics for the numeric columns in the dataframe.

In [None]:
#definitions of stat terms

#count: The number of non-null values in a colum, which represents how many data points are present for that column.
#mean: The average value of the data points in a column, calculated by adding all the values and dividing by the count.
#standard deviation: A measure of the dispersion or spread of the data points around the mean, indicating how much the values vary from the average.
#minimum: The smallest value in the column, showing the lower end of the data range.
#25% (Q1): The value below which 25% of the data points fall, marking the 25th percentile of the data.
#50% (median or Q2): The middle value of the column when the data is sorted, dividing the data into two equal halves.
#75% (Q3): The value below which 75% of the data points fall, marking the 75th percentile of the data.
#maximum: The largest value in the column, showing the upper end of the data range.

In [27]:
7.

# 1) If you're analyzing survey responses and a few respondents have left some answers blank, df.dropna() allows you to retain as much data as possible by removing only those specific incomplete responses. Using df.dropna() is better in this case if you want to remove rows with missing values while keeping all columns.

# 2) For instance, if a column about secondary phone numbers is mostly empty and irrelevant to your core analysis, deleting it with del df['col'] helps in focusing on the more relevant data.

# 3) Applying del df['col'] before df.dropna() is important because it enhances efficiency by reducing the number of columns df.dropna() needs to process and ensures that you focus on retaining valuable rows with minimal missing values, while removing irrelevant or excessively incomplete columns first and keeps the useful information.

# 4)

df = df.drop(columns=['age'])

df = df.dropna()

print("\nAfter cleaning:")
print(df.shape)
print(df.isnull().sum())

#The 'age' column was removed because it had a significant number of missing values, which might not be useful for the analysis and could complicate data cleaning. After removing columns with excessive missing values, df.dropna() was used to remove rows with any remaining missing data, ensuring that the dataset contains only complete cases.


After cleaning:
(712, 13)
survived       0
pclass         0
sex            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64


In [None]:
# first chatgpt link = "https://chatgpt.com/share/66e3b1ab-8480-800a-bc90-cedfe25dd56b"
#Summary of Exchanges
#Dataset Overview:

#User downloaded the villagers.csv dataset related to Animal Crossing and sought to understand the structure of the dataset.
#We discussed how to determine the number of rows and columns in the dataset using pandas in Python.
#Observations and Variables:

#Explained the concepts of observations (rows) and variables (columns) in the context of data analysis.
#Provided Python code to explore these concepts using the villagers.csv dataset.
#Column Summaries:

#Provided methods to obtain a simple summary of each column, both numerically and categorically, using pandas.
#Demonstrated how to use df.describe() for numerical columns and value_counts() for categorical columns.
#Difference Between df.describe() and df.shape:

#Explained the difference between df.describe() and df.shape:
#df.describe(): Provides statistical summaries for numerical columns and categorical columns if specified.
#df.shape: Returns the dimensions of the DataFrame (number of rows and columns).
#Used the Titanic dataset (titanic.csv) as an example to illustrate these concepts.
#Attributes vs. Methods in Python:

#Differentiated between attributes and methods in Python:
#Attributes: Variables that store data about an object.
#Methods: Functions that define the behaviors or actions of an object.
#Provided examples to illustrate how attributes and methods are used in Python classes.


In [30]:
# 8.

# 1) 
import pandas as pd

url = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-05/villagers.csv'
df_villagers = pd.read_csv(url)

print("Columns in Villagers dataset:")
print(df.columns)
print(df.head())
age_description_by_species = df.groupby('species')['age'].describe()

print("\nDescriptive statistics for 'age' by 'species':")
print(age_description_by_species)

# 2)
#The "df.describe()" method provides a count of non-null values for each column across the entire dataframe, reflecting overall data completeness. In contrast, "df.groupby("col1")["col2"].describe()" shows the count of non-null values for `col2` within each group defined by `col1`, highlighting variations in data completeness across different groups.


Columns in Villagers dataset:
Index(['survived', 'pclass', 'sex', 'sibsp', 'parch', 'fare', 'embarked',
       'class', 'who', 'adult_male', 'embark_town', 'alive', 'alone'],
      dtype='object')
   survived  pclass     sex  sibsp  parch     fare embarked  class    who  \
0         0       3    male      1      0   7.2500        S  Third    man   
1         1       1  female      1      0  71.2833        C  First  woman   
2         1       3  female      0      0   7.9250        S  Third  woman   
3         1       1  female      1      0  53.1000        S  First  woman   
4         0       3    male      0      0   8.0500        S  Third    man   

   adult_male  embark_town alive  alone  
0        True  Southampton    no  False  
1       False    Cherbourg   yes  False  
2       False  Southampton   yes   True  
3       False  Southampton   yes  False  
4        True  Southampton    no   True  


KeyError: 'species'

In [37]:
# 8.

# 3)
  # B}
import pandas as pd
url = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-05/villagers.csv"
df = pd.read_csv(url)
df.isna().sum()

# chat gpt is easier

  # C}
rows, columns = df.shape
print(f'Number of rows: {rows}')
print(f'Number of columns: {columns}')

print("Summary of numerical columns:")
print(numerical_summary)

categorical_summary = ['species', 'personality']  
print("\nSummary of categorical columns:")
 #chat gpt is easier

# D}
species_counts = df['species'].value_counts()
print(species_counts)
    #chat gpt is easier
    
# E}
missing_values = df.seletc_dtypes(include='number').isnull().sum()
print("Missing values in numeric columns:")
print(missing_values)
# chat gpt is easier

# F}
species_counts = df['species'].value_counts()
print(age_counts)

# chat gpt is easier

# G}
print("Columns in Villagers dataset:")
print(df.columns)
print(df.head())
age_description_by_species = df.groupby(species)['age'].describe()

# chat gpt is easier

row_n           0
id              1
name            0
gender          0
species         0
birthday        0
personality     0
song           11
phrase          0
full_id         0
url             0
dtype: int64

In [38]:
# 9) yes

In [None]:
# chat gpt link = "https://chatgpt.com/share/66e3b80b-9984-800a-a50d-f93cbe689ca5"