# Projeto_2

## Import of python libraries

This section lists every library used in the notebook in one place, so that the dependencies are easy to find.

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from math import sqrt
from geopy.geocoders import Nominatim
import datetime
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from pmdarima import auto_arima
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import warnings
from statsmodels.tsa.arima.model import ARIMA
# Silence all warnings from here on (no category or module filter is given).
warnings.filterwarnings("ignore")

## Functions used along the code

All the functions used throughout the notebook are defined here.

In [None]:
def get_city_population(city):
    """Look up a city's population on Wikipedia's list of Turkish cities.

    Downloads the "List of largest cities and towns in Turkey" page, takes the
    first table carrying the ``wikitable`` class and scans its rows (header
    row excluded) for one whose second cell equals ``city`` exactly.

    Args:
        city: City name, spelled exactly as it appears in the table.

    Returns:
        The stripped text of the seventh cell of the matching row (a string,
        as written on the page), or ``'Population not found'`` when the table
        or a matching row cannot be found.
    """
    url = 'https://en.wikipedia.org/wiki/List_of_largest_cities_and_towns_in_Turkey'
    # A timeout keeps a stalled connection from blocking the notebook forever.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', class_='wikitable')
    if table is None:
        return 'Population not found'
    for row in table.find_all('tr')[1:]:  # Exclude the header row
        columns = row.find_all('td')
        # Rows with fewer than seven cells cannot be indexed at [6]; skip them
        # instead of failing with IndexError.
        if len(columns) < 7:
            continue
        if columns[1].text.strip() == city:
            return columns[6].text.strip()
    return 'Population not found'

def latitude(city):
    """Geocode ``city`` with Nominatim and return its latitude.

    Returns the string "Latitude not found" when the geocoder has no match.
    """
    match = Nominatim(user_agent="my_app").geocode(city)
    if match is None:
        return "Latitude not found"
    return match.latitude
    
def longitude(city):
    """Geocode ``city`` with Nominatim and return its longitude.

    Returns the string "Longitude not found" when the geocoder has no match.
    """
    match = Nominatim(user_agent="my_app").geocode(city)
    if match is None:
        return "Longitude not found"
    return match.longitude
    
def season(date):
    """Return the season name for a date.

    The numeric month and day are used, so the result does not depend on the
    process locale (month names produced by ``strftime('%B')`` do).

    Season boundaries:
        * winter: 22 December - 20 March
        * spring: 21 March - 20 June
        * summer: 21 June - 22 September
        * autumn: 23 September - 21 December

    Args:
        date: Any object exposing integer ``month`` and ``day`` attributes
            (``datetime.date``, ``datetime.datetime``, ``pandas.Timestamp``).

    Returns:
        One of ``'winter'``, ``'spring'``, ``'summer'`` or ``'autumn'``.
    """
    month_day = (date.month, date.day)
    # Each tuple is the last (month, day) that still belongs to that season;
    # tuple comparison orders by month first, then by day.
    if month_day <= (3, 20):
        return 'winter'
    if month_day <= (6, 20):
        return 'spring'
    if month_day <= (9, 22):
        return 'summer'
    if month_day <= (12, 21):
        return 'autumn'
    # 22-31 December
    return 'winter'

def holidays(date):
    """Return the name of the Turkish public holiday falling on ``date``.

    Downloads Wikipedia's "Public holidays in Turkey" page and scans the first
    ``wikitable`` for a row whose first cell equals the date written as
    ``"<day> <English month name>"`` (for example ``"1 January"``). Only day
    and month are compared; the year of ``date`` is not used.

    Args:
        date: Any object exposing integer ``day`` and ``month`` attributes.

    Returns:
        The stripped text of the second cell of the matching row, or
        ``'Not Holiday'`` when no row matches or no table is found.
    """
    # Month names are spelled out in English to match the page: strftime('%B')
    # follows the process locale, and the '%#d' flag used for the unpadded day
    # is a platform-specific extension.
    month_names = (
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    )
    formated = f"{date.day} {month_names[date.month - 1]}"
    url = 'https://en.wikipedia.org/wiki/Public_holidays_in_Turkey'
    # A timeout keeps a stalled connection from blocking the notebook forever.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the table containing the holiday data
    tables = soup.find_all('table', 'wikitable')
    if not tables:
        return 'Not Holiday'
    for row in tables[0].find_all('tr')[1:]:
        # Extract the cells of each row; rows with fewer than two cells
        # cannot be a date/name pair, so skip them instead of raising.
        columns = row.find_all('td')
        if len(columns) < 2:
            continue
        if columns[0].text.strip() == formated:
            return columns[1].text.strip()
    return 'Not Holiday'

## Data cleaning

In this section the data is cleaned. For the first two CSV files, the city names are corrected and null values are replaced using the pandas interpolate function as well as a k-nearest neighbors algorithm.  
For the third CSV file, some columns are dropped and null values are filled. Around 7000 null values remain, which represents less than 1% of the entire CSV file, so we decided to drop those rows.

In [None]:
# Load the city table and repair the spelling of the city names.
cities_df = pd.read_csv("cities.csv")

# Ordered (old, new) substitutions: '?' is first turned into 'i', and the
# later entries rely on that (e.g. 'Eskiiehir' only exists after that step).
city_name_fixes = [
    ('?', 'i'),
    ('Sanliurfa', 'Şanlıurfa'),
    ('Izmir', 'İzmir'),
    ('Diyarbakir', 'Diyarbakır'),
    ('Eskiiehir', 'Eskişehir'),
    ('Adapazari', 'Adapazarı'),
    ('Kahramanmaras', 'Kahramanmaraş'),
]
for old, new in city_name_fixes:
    # regex=False forces literal matching: '?' is a regex metacharacter and
    # the default of the `regex` argument differs between pandas versions.
    cities_df['city_code'] = cities_df['city_code'].str.replace(old, new, regex=False)

cities_df['city_code'] = cities_df['city_code'].str.title()

In [None]:
product_df = pd.read_csv("product.csv")

# Check for nulls and replace missing values
print(product_df.isna().sum())
dimension_cols = ['product_length', 'product_depth', 'product_width']
for col in dimension_cols:
    # Assign the result back instead of calling interpolate(inplace=True) on a
    # column selection, which is not guaranteed to write through to product_df.
    product_df[col] = product_df[col].interpolate(method='linear', limit_direction='both')
print(product_df.isna().sum())

# Split the data into complete and missing
# NOTE(review): dropna() removes rows with a null in ANY column, while
# missing_data only keeps rows whose cluster_id is null; a row with a
# cluster_id but a null elsewhere ends up in neither frame - confirm intended.
complete_data = product_df.dropna()
missing_data = product_df[product_df['cluster_id'].isnull()].drop('cluster_id', axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    complete_data[dimension_cols],
    complete_data['cluster_id'],
    test_size=0.2,
    random_state=42
)

# Perform K-nearest neighbors classification with hyperparameter tuning
parameters = {
    'n_neighbors': range(1, 21),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 20, 30, 40, 50],
    'p': [1, 2]
}
knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, parameters, cv=5)
grid_search.fit(X_train, y_train)

# Best hyperparameters and model
best_k = grid_search.best_params_['n_neighbors']
best_weights = grid_search.best_params_['weights']
best_algorithm = grid_search.best_params_['algorithm']
best_leaf_size = grid_search.best_params_['leaf_size']
best_p = grid_search.best_params_['p']
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test)

# Evaluate the performance of the model (macro average: every cluster counts equally)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Predict clusters for missing data using the best model.
# With no missing rows there is nothing to predict, and predicting on an
# empty frame would raise, so the step is skipped in that case.
if not missing_data.empty:
    missing_data['cluster_id'] = best_model.predict(missing_data[dimension_cols])

# Concatenate the complete data with the imputed missing data
result_df = pd.concat([complete_data, missing_data])

# Print the resulting DataFrame with clusters
print(result_df)
print(result_df.isna().sum())

result_df.to_csv("product_df.csv", index=False)

In [None]:
sales_df = pd.read_csv("sales.csv")
# Keep rows dated before 2019-10-01. The comparison is made on the raw column
# values - assumes ISO 'YYYY-MM-DD' strings, TODO confirm.
sales_df = sales_df[sales_df["date"] < "2019-10-01"]
# Drop the first column by position - presumably a saved row index; verify.
sales_df = sales_df.drop(sales_df.columns[0], axis=1)
print(sales_df.isna().sum())

# Fill missing prices with the mean price of the same product
sales_df['price'] = sales_df.groupby('product_id')['price'].transform(lambda x: x.fillna(x.mean()))
print(sales_df.isna().sum())
# Nulls that remain after this belong to products with no price on any row

# Drop columns with a lot of nulls
sales_df.drop(['promo_bin_2', 'promo_discount_2', 'promo_discount_type_2'], axis=1, inplace=True)

# Count distinct values of a column
value_counts = sales_df['promo_type_2'].value_counts()
print(value_counts)

# Given that less than one percent of the data related to promo_type_2 is different from the majority of the values
# we will drop this column as well
sales_df.drop(['promo_type_2'], axis=1, inplace=True)

value_counts = sales_df['promo_type_1'].value_counts()
print(value_counts)

value_counts = sales_df['promo_bin_1'].value_counts()
print(value_counts)

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection, which is not guaranteed to write through to sales_df.
sales_df['promo_bin_1'] = sales_df['promo_bin_1'].fillna("none")

# Disabled one-hot encoding of the promo columns, kept for reference:
#binary_cols = pd.get_dummies(sales_df['promo_bin_1'], prefix='promo_bin_')
#sales_df = pd.concat([sales_df, binary_cols], axis=1)

#binary_cols = pd.get_dummies(sales_df['promo_type_1'], prefix='promo_type_')
#sales_df = pd.concat([sales_df, binary_cols], axis=1)

#sales_df.drop(['promo_type_1', 'promo_bin_1'], axis=1, inplace=True)

# Drop the remaining rows with null values (prices); around 7000 rows, less
# than 1% of the data. Re-check after modelling whether this matters.
sales_df.dropna(inplace=True)
print(sales_df.isna().sum())
sales_df.to_csv("sales_df.csv", index=False)

# Adding external variables 

In [None]:
# Enrich every city with its scraped population and geocoded coordinates
for column, lookup in (
    ('Population', get_city_population),
    ('Latitude', latitude),
    ('Longitude', longitude),
):
    cities_df[column] = cities_df['city_code'].apply(lookup)

print(cities_df)

# The lookup fails for Izmir, so the placeholder is replaced by hand.
# NOTE(review): this substitutes the same figure for every city whose lookup
# failed, not only Izmir - confirm that Izmir is the only such row.
population = cities_df['Population'].str.replace('Population not found', '2,847,691')

# Strip the thousands separators
cities_df['Population'] = population.str.replace(',', '')

cities_df.to_csv("cities_df.csv", index=False)

In [None]:
# One row per distinct sales date, tagged with its season
df_1 = pd.read_csv("sales_df.csv")[['date']].drop_duplicates()
df_1['date'] = pd.to_datetime(df_1['date'])
print(df_1)
season_labels = df_1['date'].apply(season)
df_1['season'] = season_labels
print(df_1)

In [None]:
# Tag every date with the public-holiday name returned by holidays()
holiday_names = df_1['date'].apply(holidays)
df_1['holidays'] = holiday_names
print(df_1)

# Some values are changed manually after this export
df_1.to_csv("season_holidays.csv", index=False)

# Graphs

In [None]:
df = pd.read_csv('sales_df.csv')
df['date'] = pd.to_datetime(df['date'])
# Total sales per (date, store) pair
aggregated_df = df.groupby(['date', 'store_id'], as_index=False)['sales'].sum()

grouped_df = aggregated_df.groupby('store_id')
for store_id, group_data in grouped_df:
    # 7-row rolling average of the store's sales (NaN for the first six rows)
    rolling_mean = group_data['sales'].rolling(window=7).mean()

    # One figure per store: raw sales plus the smoothed curve
    fig, ax = plt.subplots()
    ax.plot(group_data['date'], group_data['sales'], label='Sales')
    ax.plot(group_data['date'], rolling_mean, label='Rolling Mean')
    ax.set_title(f"Store ID: {store_id}")
    ax.set_xlabel('Date')
    ax.set_ylabel('Sales')
    ax.legend()
    plt.show()

## Transforming data for future analysis

In [None]:
# Disabled code kept inside a string literal, so nothing below is executed.
# It builds the distinct (store_id, date) pairs of sales_df, left-merges them
# with cities_df on store_id, and prints the distinct (city_code, date) pairs
# together with their count.
"""
unique_combinations = sales_df[['store_id', 'date']].drop_duplicates()
merged_df = unique_combinations.merge(cities_df, on='store_id', how='left')
print(merged_df)
unique_combinations_2 = merged_df[['city_code', 'date']].drop_duplicates()
print(unique_combinations_2)
print(len(unique_combinations_2))
"""

In [None]:
# Disabled code kept inside a string literal, so nothing below is executed.
# It aggregates sales_df per week (W-MON) and store: summed sales, revenue and
# stock, mean price, and the number of distinct promo types/bins, rounded.
# NOTE(review): the last line reads `future_ts.head` without call parentheses,
# so re-enabling it as-is would reference the method instead of showing rows.
"""
sales_df['date'] = pd.to_datetime(sales_df['date'])  # Convert 'date' column to datetime type

future_ts = sales_df.groupby([pd.Grouper(key='date', freq='W-MON'), 'store_id']) \
    .agg(total_sales=('sales', 'sum'),
         total_revenue=('revenue', 'sum'),
         total_stock=('stock', 'sum'),
         average_price=('price', 'mean'),
         promo_type_count=('promo_type_1', 'nunique'),
         promo_bin_count=('promo_bin_1', 'nunique')) \
    .round()

print(future_ts)
future_ts.head
"""

# TESTING AREA

## RANDOM FOREST

In [None]:
# Load the data
sales_df = pd.read_csv("sales_df.csv")
sales_df['date'] = pd.to_datetime(sales_df['date'])
# Work on a single store; drop() returns a new frame, which avoids an inplace
# modification of the query() result.
one_store = sales_df.query("store_id == 'S0002'").drop(columns=['store_id'])
# Weekly (W-MON) aggregates for that store
future_ts = one_store.groupby([pd.Grouper(key='date', freq='W-MON')]) \
    .agg(total_sales=('sales', 'sum'),
         total_revenue=('revenue', 'sum'),
         average_price=('price', 'mean'),
         promo_type_count=('promo_type_1', 'nunique'),
         promo_bin_count=('promo_bin_1', 'nunique')) \
    .round()

# Hold out the last two weeks for evaluation
train_data = future_ts[:-2]
test_data = future_ts[-2:]

feature_cols = ['total_revenue', 'average_price', 'promo_type_count', 'promo_bin_count']

X_train = train_data[feature_cols]
y_train = train_data['total_sales']

# The evaluation set must come from the held-out weeks; taking it from
# train_data would report the training error instead.
X_test = test_data[feature_cols]
y_test = test_data['total_sales']


# Random Forest
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Model evaluation
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f'RMSE: {rmse}')

# Model tuning

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

# Create the ensemble model (Random Forest)
model_bp = RandomForestRegressor(random_state=42)

# Grid search for the best parameters
grid_search = GridSearchCV(estimator=model_bp, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

# Best parameters found
best_params = grid_search.best_params_
print('Melhores parâmetros:', best_params)

# Evaluate the model with the best parameters
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f'RMSE: {rmse}')

## Auto Arima weekly (number of sales)

In [3]:
import pandas as pd

df = pd.read_csv("sales_df.csv")
df['date'] = pd.to_datetime(df['date'])
# Weekly (W-MON) sales total per store
grouped_df = (
    df.groupby([pd.Grouper(key='date', freq='W-MON'), 'store_id'])['sales']
    .sum()
    .reset_index()
)

# Number of weekly rows each store has in each calendar year
year_of_week = grouped_df['date'].dt.year
week_counts = (
    grouped_df.groupby(['store_id', year_of_week])
    .size()
    .reset_index(name='week_count')
)
print(week_counts)
week_counts.to_csv("week_counts.csv", index=False)


    store_id  date  week_count
0      S0002  2017          52
1      S0002  2018          53
2      S0002  2019          39
3      S0003  2017          52
4      S0003  2018          53
..       ...   ...         ...
174    S0142  2018          53
175    S0142  2019          39
176    S0143  2017          52
177    S0143  2018          53
178    S0143  2019          39

[179 rows x 3 columns]


In [None]:
import pandas as pd
from pmdarima import auto_arima
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Step 1: Load and preprocess the dataset
df = pd.read_csv("sales_df.csv")
df['date'] = pd.to_datetime(df['date'])

# Step 2: Group the data by store and week
grouped_df = df.groupby([pd.Grouper(key='date', freq='W-MON'), 'store_id']).agg({'sales': 'sum'}).reset_index()

# Step 3: Split the data into training and testing sets
# Everything later than (last week - 4 weeks) is held out for testing.
test_start_week = grouped_df['date'].max() - pd.DateOffset(weeks=4)
print(test_start_week)
train_data = grouped_df[grouped_df['date'] <= test_start_week]
print(train_data)
test_data = grouped_df[grouped_df['date'] > test_start_week]
print(test_data)

# Step 4: Perform autoarima on each store's time series
forecast_frames = []
for store, store_data in train_data.groupby('store_id'):
    print(store)
    print(store_data)
    store_test = test_data[test_data['store_id'] == store]
    future_weeks = len(store_test)
    print(future_weeks)
    if future_weeks == 0:
        # No held-out weeks for this store: nothing to forecast or score.
        continue

    # auto_arima returns an already fitted model, so no separate fit() call.
    model = auto_arima(store_data['sales'])
    forecast_start_week = store_data['date'].max() + pd.DateOffset(weeks=1)
    print(forecast_start_week)
    # The forecast always starts right after the training data; the horizon
    # is given by n_periods alone.
    future_forecast = model.predict(n_periods=future_weeks)
    print(future_forecast)

    # The forecast may carry its own index, so only its values are kept and
    # re-indexed on the test rows to line up with the actual sales.
    predicted_sales = pd.Series(list(future_forecast), index=store_test.index, name=store)
    print(predicted_sales)
    forecast_frames.append(predicted_sales.to_frame())

    # Step 5: Evaluate the model performance
    actual_sales = store_test['sales']
    print(actual_sales)

    mae = mean_absolute_error(actual_sales, predicted_sales)
    # Square root taken by hand rather than through the `squared=False` flag.
    rmse = mean_squared_error(actual_sales, predicted_sales) ** 0.5
    # Weeks with zero actual sales are left out of MAPE to avoid dividing by zero.
    nonzero = actual_sales != 0
    mape = ((actual_sales - predicted_sales) / actual_sales)[nonzero].abs().mean() * 100

    print(f"Metrics for Store {store}:")
    print(f"MAE: {mae}")
    print(f"RMSE: {rmse}")
    print(f"MAPE: {mape}%")

# One column per store, indexed by the test rows of that store.
# pd.concat replaces DataFrame.append, which newer pandas no longer provides.
predictions = pd.concat(forecast_frames) if forecast_frames else pd.DataFrame()
print(predictions)

# Step 6: Visualize the results
for store in predictions.columns:
    store_train = train_data[train_data['store_id'] == store]
    store_test = test_data[test_data['store_id'] == store]
    plt.figure()
    # Plot against the week dates so the x axis is a real time axis.
    plt.plot(store_train['date'], store_train['sales'], label='Train')
    plt.plot(store_test['date'], store_test['sales'], label='Test')
    plt.plot(store_test['date'], predictions.loc[store_test.index, store], label='Predicted')
    plt.title(f"Sales Prediction - Store {store}")
    plt.xlabel("Week")
    plt.ylabel("Sales")
    plt.legend()
    plt.show()


## Auto Arima Weekly (revenue)

In [None]:
import pandas as pd
from pmdarima import auto_arima
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Step 1: Load and preprocess the dataset
df = pd.read_csv("sales_df.csv")
df['date'] = pd.to_datetime(df['date'])

# Step 2: Group the data by store and week
grouped_df = df.groupby([pd.Grouper(key='date', freq='W-MON'), 'store_id']).agg({'revenue': 'sum'}).reset_index()

# Step 3: Split the data into training and testing sets
# Everything later than (last week - 4 weeks) is held out for testing.
test_start_week = grouped_df['date'].max() - pd.DateOffset(weeks=4)
print(test_start_week)
train_data = grouped_df[grouped_df['date'] <= test_start_week]
print(train_data)
test_data = grouped_df[grouped_df['date'] > test_start_week]
print(test_data)

# Step 4: Perform autoarima on each store's time series
forecast_frames = []
for store, store_data in train_data.groupby('store_id'):
    print(store)
    print(store_data)
    store_test = test_data[test_data['store_id'] == store]
    future_weeks = len(store_test)
    print(future_weeks)
    if future_weeks == 0:
        # No held-out weeks for this store: nothing to forecast or score.
        continue

    # auto_arima returns an already fitted model, so no separate fit() call.
    model = auto_arima(store_data['revenue'])
    forecast_start_week = store_data['date'].max() + pd.DateOffset(weeks=1)
    print(forecast_start_week)
    # The forecast always starts right after the training data; the horizon
    # is given by n_periods alone.
    future_forecast = model.predict(n_periods=future_weeks)
    print(future_forecast)

    # The forecast may carry its own index, so only its values are kept and
    # re-indexed on the test rows to line up with the actual revenue.
    predicted_revenue = pd.Series(list(future_forecast), index=store_test.index, name=store)
    print(predicted_revenue)
    forecast_frames.append(predicted_revenue.to_frame())

    # Step 5: Evaluate the model performance
    actual_revenue = store_test['revenue']
    print(actual_revenue)

    mae = mean_absolute_error(actual_revenue, predicted_revenue)
    # Square root taken by hand rather than through the `squared=False` flag.
    rmse = mean_squared_error(actual_revenue, predicted_revenue) ** 0.5
    # Weeks with zero actual revenue are left out of MAPE to avoid dividing by zero.
    nonzero = actual_revenue != 0
    mape = ((actual_revenue - predicted_revenue) / actual_revenue)[nonzero].abs().mean() * 100

    print(f"Metrics for Store {store}:")
    print(f"MAE: {mae}")
    print(f"RMSE: {rmse}")
    print(f"MAPE: {mape}%")

# One column per store, indexed by the test rows of that store.
# pd.concat replaces DataFrame.append, which newer pandas no longer provides.
predictions = pd.concat(forecast_frames) if forecast_frames else pd.DataFrame()
print(predictions)

# Step 6: Visualize the results
for store in predictions.columns:
    store_train = train_data[train_data['store_id'] == store]
    store_test = test_data[test_data['store_id'] == store]
    plt.figure()
    # Plot against the week dates so the x axis is a real time axis.
    plt.plot(store_train['date'], store_train['revenue'], label='Train')
    plt.plot(store_test['date'], store_test['revenue'], label='Test')
    plt.plot(store_test['date'], predictions.loc[store_test.index, store], label='Predicted')
    plt.title(f"Revenue Prediction - Store {store}")
    plt.xlabel("Week")
    plt.ylabel("Revenue")
    plt.legend()
    plt.show()
