In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, precision_score, recall_score

In [2]:
# Read the avocado CSV into the working DataFrame `df`.
file_path = "instance/avocado.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Preview the first five rows (five is head()'s default row count).
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,0,2015-12-27,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany
1,1,2015-12-20,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
2,2,2015-12-13,0.93,118220.22,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,conventional,2015,Albany
3,3,2015-12-06,1.08,78992.15,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,conventional,2015,Albany
4,4,2015-11-29,1.28,51039.6,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany


In [4]:
# Print a one-screen overview of the frame: dimensions, column names, the
# summed result of memory_usage(), and the per-column null counts.
print(
    "Data Information:",
    f"Shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    f"Memory Usage: {df.memory_usage().sum()} bytes",
    f"Missing Values:\n{df.isnull().sum()}",
    sep="\n",
)

Data Information:
Shape: (18249, 14)
Columns: ['Unnamed: 0', 'Date', 'AveragePrice', 'Total Volume', '4046', '4225', '4770', 'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags', 'type', 'year', 'region']
Memory Usage: 2044020 bytes
Missing Values:
Unnamed: 0      0
Date            0
AveragePrice    0
Total Volume    0
4046            0
4225            0
4770            0
Total Bags      0
Small Bags      0
Large Bags      0
XLarge Bags     0
type            0
year            0
region          0
dtype: int64


In [10]:
# Look up the dtype of every column, then tally how many columns share each dtype.
data_types = df.dtypes
data_type_counts = df.dtypes.value_counts()

print("Data Types of Each Column:", data_types, sep="\n")
print("\nCount of Each Data Type:", data_type_counts, sep="\n")

Data Types of Each Column:
Unnamed: 0        int64
Date             object
AveragePrice    float64
Total Volume    float64
4046            float64
4225            float64
4770            float64
Total Bags      float64
Small Bags      float64
Large Bags      float64
XLarge Bags     float64
type             object
year              int64
region           object
dtype: object

Count of Each Data Type:
float64    9
object     3
int64      2
Name: count, dtype: int64


In [11]:
# Count the null entries in each column and print the tally under a heading.
missing_values = df.isnull().sum()
print("Missing Values in Each Column:", missing_values, sep="\n")

Missing Values in Each Column:
Unnamed: 0      0
Date            0
AveragePrice    0
Total Volume    0
4046            0
4225            0
4770            0
Total Bags      0
Small Bags      0
Large Bags      0
XLarge Bags     0
type            0
year            0
region          0
dtype: int64


In [12]:
# Descriptive statistics from describe() called with its default arguments.
numerical_summary = df.describe()
print("Numerical Summary:", numerical_summary, sep="\n")

Numerical Summary:
         Unnamed: 0  AveragePrice  Total Volume          4046          4225  \
count  18249.000000  18249.000000  1.824900e+04  1.824900e+04  1.824900e+04   
mean      24.232232      1.405978  8.506440e+05  2.930084e+05  2.951546e+05   
std       15.481045      0.402677  3.453545e+06  1.264989e+06  1.204120e+06   
min        0.000000      0.440000  8.456000e+01  0.000000e+00  0.000000e+00   
25%       10.000000      1.100000  1.083858e+04  8.540700e+02  3.008780e+03   
50%       24.000000      1.370000  1.073768e+05  8.645300e+03  2.906102e+04   
75%       38.000000      1.660000  4.329623e+05  1.110202e+05  1.502069e+05   
max       52.000000      3.250000  6.250565e+07  2.274362e+07  2.047057e+07   

               4770    Total Bags    Small Bags    Large Bags    XLarge Bags  \
count  1.824900e+04  1.824900e+04  1.824900e+04  1.824900e+04   18249.000000   
mean   2.283974e+04  2.396392e+05  1.821947e+05  5.433809e+04    3106.426507   
std    1.074641e+05  9.862424

In [13]:
# Summarise the object-dtype columns (count / unique / top / freq).
categorical_columns = df.select_dtypes(include=["object"]).columns
categorical_summary = df.loc[:, categorical_columns].describe()
print("\nCategorical Summary:", categorical_summary, sep="\n")


Categorical Summary:
              Date          type  region
count        18249         18249   18249
unique         169             2      54
top     2015-12-27  conventional  Albany
freq           108          9126     338


In [15]:
# Restrict to float64/int64 columns so corr() only sees numeric data, then
# compute the pairwise correlation matrix with corr()'s default arguments.
numerical_columns = df.select_dtypes(include=["float64", "int64"]).columns
correlation_matrix = df.loc[:, numerical_columns].corr()

print("Correlation Matrix:", correlation_matrix, sep="\n")

Correlation Matrix:
              Unnamed: 0  AveragePrice  Total Volume      4046      4225  \
Unnamed: 0      1.000000     -0.133008      0.014035  0.017628  0.019829   
AveragePrice   -0.133008      1.000000     -0.192752 -0.208317 -0.172928   
Total Volume    0.014035     -0.192752      1.000000  0.977863  0.974181   
4046            0.017628     -0.208317      0.977863  1.000000  0.926110   
4225            0.019829     -0.172928      0.974181  0.926110  1.000000   
4770            0.041752     -0.179446      0.872202  0.833389  0.887855   
Total Bags     -0.002219     -0.177088      0.963047  0.920057  0.905787   
Small Bags      0.000347     -0.174730      0.967238  0.925280  0.916031   
Large Bags     -0.009196     -0.172940      0.880640  0.838645  0.810015   
XLarge Bags    -0.011546     -0.117592      0.747157  0.699377  0.688809   
year           -0.171667      0.093197      0.017193  0.003353 -0.009559   

                  4770  Total Bags  Small Bags  Large Bags  XLarge 

In [6]:
# Clean data
#
# 1. Drop rows that contain any missing value.
# 2. Drop exact duplicate rows.
# 3. Parse `Date` from text into a datetime column. The summary statistics
#    recorded further down report datetime min/mean/max values for `Date`,
#    but the raw CSV loads it as `object` and no other visible cell converts
#    it, so the conversion is done here to keep a top-to-bottom run
#    consistent with that output.
#
# The chain builds a new frame each time, so re-running this cell is safe
# (to_datetime on an already-parsed column leaves it unchanged).
df = (
    df.dropna()
    .drop_duplicates()
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
)

In [18]:
# Print describe() for the current frame under a heading.
print("\nSummary Statistics:", df.describe(), sep="\n")


Summary Statistics:
         Unnamed: 0                           Date  AveragePrice  \
count  18249.000000                          18249  18249.000000   
mean      24.232232  2016-08-13 23:30:43.498273792      1.405978   
min        0.000000            2015-01-04 00:00:00      0.440000   
25%       10.000000            2015-10-25 00:00:00      1.100000   
50%       24.000000            2016-08-14 00:00:00      1.370000   
75%       38.000000            2017-06-04 00:00:00      1.660000   
max       52.000000            2018-03-25 00:00:00      3.250000   
std       15.481045                            NaN      0.402677   

       Total Volume          4046          4225          4770    Total Bags  \
count  1.824900e+04  1.824900e+04  1.824900e+04  1.824900e+04  1.824900e+04   
mean   8.506440e+05  2.930084e+05  2.951546e+05  2.283974e+04  2.396392e+05   
min    8.456000e+01  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
25%    1.083858e+04  8.540700e+02  3.008780e+03  0

In [19]:
# Model inputs: the columns '4046', '4225' and '4770' are the features and
# 'Total Volume' is the regression target.
# NOTE(review): the correlation matrix printed earlier shows these features
# correlating roughly 0.87-0.98 with the target, so a very high R² is expected
# — confirm this is the intended modelling question.
feature_columns = ["4046", "4225", "4770"]
target_column = "Total Volume"

X = df.loc[:, feature_columns]
y = df.loc[:, target_column]

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same from run to run.
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=42,
)

In [20]:
# Fit a linear regression on the training split (fit() hands back the
# estimator itself, so construction and fitting can be chained).
model = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out test rows.
y_pred = model.predict(X_test)

In [24]:
# Evaluate the model
#
# Regression metrics on the held-out test split.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# The target is continuous, so precision/recall cannot be computed on it
# directly. Instead, both the true and the predicted values are thresholded at
# the median of the test targets ("above the median" is the positive class) and
# the classification metrics are computed on those derived labels. The
# threshold is computed once so both series are cut at exactly the same value.
median_threshold = y_test.median()
y_test_binary = y_test > median_threshold
y_pred_binary = y_pred > median_threshold
precision = precision_score(y_test_binary, y_pred_binary)
recall = recall_score(y_test_binary, y_pred_binary)

print("\nModel Performance:")
print(f"R² Score: {r2:.4f}")
print(f"MAE: {mae:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")


Model Performance:
R² Score: 0.9878
MAE: 103346.7689
Precision: 0.9767
Recall: 0.9397


In [25]:
# Pair each feature name with its fitted regression coefficient and print one
# "name: value" line per feature. These are the raw coefficients; no feature
# scaling is applied in the visible cells, so their sizes reflect the input
# units as well as each feature's contribution.
feature_importance = {
    name: weight for name, weight in zip(feature_columns, model.coef_)
}
print("\nFeature Importance:")
for feature, importance in feature_importance.items():
    print(f"{feature}: {importance:.4f}")


Feature Importance:
4046: 1.4373
4225: 1.3746
4770: 0.2507
