# Meteorite Landings Analysis
### Can we predict where the next likely impact zone will be from the information given?

In [None]:
# Importing all the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score

#### Loading the dataset 

In [None]:
# Read the meteorite landings CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer="Meteorite_Landings.csv")

# Report the raw (rows, columns) shape before any cleaning is applied
print("Data Shape:", data.shape)

# Preview the first five rows
data.head(n=5)

### Displaying the Basic Information about the dataset

In [None]:
# Show the column names, non-null counts and dtypes of the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("Data Information:")
data.info()

In [None]:
# Descriptive statistics for the numeric columns; the header and the table
# are written on separate lines by using a newline separator
print("Summary Statistics:", data.describe(), sep="\n")

#### Identifying missing values in the dataset, then filling or dropping them for convenience

In [None]:
# Count the missing entries in every column and print them under a header
print("Missing Values:", data.isnull().sum(), sep="\n")

In [None]:
# Shorten the 'mass (g)' column header to 'mass'
data.rename({'mass (g)': 'mass'}, axis='columns', inplace=True)

# Remove every row that lacks a latitude or a longitude value
data.dropna(axis=0, how='any', subset=['reclat', 'reclong'], inplace=True)

# Discard the columns that are not needed (silently skipped when absent)
data.drop(['Unnamed: 10', 'GeoLocation'], axis=1, errors='ignore', inplace=True)

# Remember the (rows, columns) shape once the coordinate-less rows are gone
after_dropping_targets = data.shape


In [None]:
# Impute missing 'year' and 'mass' values with the respective column median.
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on data['col']: that chained in-place form operates on
# an intermediate Series, raises a FutureWarning in pandas 2.x and no longer
# updates the DataFrame once copy-on-write is enabled (pandas 3.0).
data['year'] = data['year'].fillna(data['year'].median())
data['mass'] = data['mass'].fillna(data['mass'].median())


In [None]:
# To handle the skewness of 'mass', store log(1 + mass) in a new 'log_mass' column
data['log_mass'] = data['mass'].apply(np.log1p)

In [None]:
# Re-check the per-column missing-value counts now that cleaning is done
print("Remaining Missing Values:", data.isnull().sum(), sep="\n")

# Report the (rows, columns) shape of the cleaned data, then preview five rows
print("\nData Shape After Cleaning:", data.shape)
data.head(n=5)


In [None]:
# Column groups used by the exploratory plots:
# numeric columns plotted as histograms and boxplots ...
continuous_features = [
    'log_mass',
    'year',
    'reclat',
    'reclong',
]
# ... and label columns plotted as category-count bar charts
categorical_features = [
    'fall',
    'nametype',
    'recclass',
]

In [None]:
# Bar charts of category frequencies, one subplot per categorical feature
plt.figure(figsize=(12, 10))
for position, cat_col in enumerate(categorical_features, start=1):
    # Occurrences of each category, most frequent first; only the ten most
    # frequent are kept because 'recclass' has too many categories to plot
    top_counts = data[cat_col].value_counts().head(10)

    # Draw this feature's bars in its own cell of the 2x2 grid
    plt.subplot(2, 2, position)
    plt.bar(top_counts.index, top_counts.values, color='blue', edgecolor='black')
    plt.title(f"Distribution of '{cat_col}'")
    plt.xlabel(cat_col)
    plt.ylabel("Count")
plt.tight_layout()
plt.show()


In [None]:
# Histograms (100 bins each) of the continuous features, laid out on a 2x2 grid
plt.figure(figsize=(12, 10))
for slot, feature in enumerate(continuous_features, start=1):
    plt.subplot(2, 2, slot)
    plt.hist(data[feature], bins=100, color='blue', edgecolor='black')
    plt.title(f"Histogram of Frequency of Meteorites by {feature}")
    plt.xlabel(feature)
    plt.ylabel("Frequency")
plt.tight_layout()
plt.show()


In [None]:
# One boxplot per continuous feature, laid out on a 2x2 grid
plt.figure(figsize=(12, 10))
for panel, col_name in enumerate(continuous_features, 1):
    plt.subplot(2, 2, panel)
    plt.boxplot(data[col_name])
    plt.title(f"Boxplot of {col_name}")
    plt.xlabel(col_name)
plt.tight_layout()
plt.show()

## TODO: Add bivariate analysis: categorical vs. categorical, continuous vs. categorical, and continuous vs. continuous

### Split the Data

In [None]:
# Targets: the landing coordinates ('reclat', 'reclong') to be predicted
y = data[['reclat', 'reclong']]

# Features: every remaining column, with non-numeric columns expanded into
# dummy variables (the first level of each is dropped)
# NOTE(review): get_dummies encodes every non-numeric column still in the
# frame; if identifier-like text columns are present this yields one dummy
# per distinct value. Confirm the column set against the CSV schema.
X = pd.get_dummies(data.drop(columns=['reclat', 'reclong']), drop_first=True)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

## Training the Models