In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Data handling and processing
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Model selection and evaluation
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Machine learning models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Set up for inline plotting
%matplotlib inline

# Set global styling options for visualizations
sns.set(style='whitegrid', palette='muted')


In [None]:
# Load the training keyword table.
train_keyword = pd.read_csv('/kaggle/input/can-i-be-an-influencer/train_keywords.csv')
# A notebook cell only renders its last expression, so the preview has to be
# displayed explicitly; a bare `.head()` in the middle of the cell is discarded.
display(train_keyword.head())
# Fraction of rows that carry a distinct image_id (distinct ids / total rows).
len(np.unique(train_keyword['image_id'])) / len(train_keyword)

In [None]:
def top_5_matches(train_keyword):
    """Return the five highest-confidence rows followed by all repeated-keyword rows.

    Parameters
    ----------
    train_keyword : pandas.DataFrame
        Rows to reduce; must have ``confidence_value`` and
        ``classification_keyword`` columns.

    Returns
    -------
    pandas.DataFrame
        First the (at most) five rows with the largest ``confidence_value`` in
        descending order, then every row whose ``classification_keyword`` occurs
        more than once in the input (in input order). The index is renumbered
        from 0. A row may appear in both parts, so the result can hold more
        than five rows.
    """
    by_confidence = train_keyword.sort_values(by='confidence_value', ascending=False)
    strongest = by_confidence.iloc[:5]
    repeated_mask = train_keyword.duplicated(subset='classification_keyword', keep=False)
    repeated = train_keyword.loc[repeated_mask]
    return pd.concat([strongest, repeated], ignore_index=True)

# Reduce each image's keyword rows with top_5_matches, then discard the index
# that apply() builds so the rows are numbered 0..n-1 again.
output = train_keyword.groupby('image_id').apply(top_5_matches)
output = output.reset_index(drop=True)

In [None]:
# Remove the column that nothing later in the notebook reads. Rebinding with
# errors="ignore" (instead of inplace=True) keeps the cell re-runnable: a second
# execution is a no-op rather than a KeyError.
train_keyword = train_keyword.drop(columns=["image_classification"], errors="ignore")

In [None]:
# Collapse each image's rows into one list of keywords and one list of confidences.
per_image = output.groupby('image_id')
grouped_key = per_image['classification_keyword'].agg(list).reset_index()
grouped_conf = per_image['confidence_value'].agg(list).reset_index()

In [None]:
# One row per image_id holding both the keyword list and the confidence list.
result = grouped_key.merge(grouped_conf, on='image_id')

In [None]:
import os
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Directory of the attached dataset that holds the BERT files; both the
# tokenizer and the model are loaded from this same path.
BERT_MODEL_PATH = '/kaggle/input/bert-base-uncased'

# Tokenizer used by word_embed() to turn text into token ids.
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH)

# Sequence-classification wrapper around BERT. word_embed() only calls the
# `model.bert` sub-module of this object.
model = BertForSequenceClassification.from_pretrained(BERT_MODEL_PATH)


In [None]:
def word_embed(input_text):
    """Embed a text as the first 10 components of its token-averaged BERT output.

    The text is encoded with the module-level ``tokenizer`` (special tokens
    added, truncated to at most 512 tokens) and passed through ``model.bert``
    with gradient tracking disabled.

    Parameters
    ----------
    input_text : str
        Text to embed.

    Returns
    -------
    torch.Tensor
        The first 10 entries of the mean, taken over the token axis, of the
        first output of ``model.bert`` for the single encoded sequence.
    """
    token_ids = tokenizer.encode(
        input_text,
        add_special_tokens=True,
        truncation=True,
        max_length=512,
        return_tensors='pt',
    )
    with torch.no_grad():
        first_output = model.bert(token_ids)[0]
    # Index 0 selects the only sequence in the batch.
    per_token = first_output[0]
    return per_token.mean(dim=0)[:10]


In [None]:
# Embed every distinct keyword exactly once and cache the vector by keyword,
# so repeated keywords across images do not trigger repeated BERT calls.
p = {keyword for keywords in result["classification_keyword"] for keyword in keywords}
from tqdm import tqdm
dict_val = {keyword: word_embed(keyword) for keyword in tqdm(list(p))}

In [None]:
from tqdm import tqdm
c = result["classification_keyword"]
n = len(c)
features = np.zeros((n, 10))

# Row i = sum of the cached vectors of image i's keywords, divided by 5.
# NOTE(review): the divisor is fixed at 5, but top_5_matches can return more
# (repeated keywords) or fewer (small groups) rows per image, so this is not
# always a true mean - confirm that sum/5 is intended.
for i in tqdm(range(n)):
    val = np.array([dict_val[w] for w in c[i]])
    features[i] = val.sum(axis=0) / 5

In [None]:
# Append the (n, 10) feature matrix as ten columns labelled 0..9 in one step.
# Building the frame directly avoids the temporary list-valued 'features'
# column and the slow row-by-row apply(pd.Series) expansion, and yields the
# same columns and values.
result = pd.concat([result, pd.DataFrame(features, index=result.index)], axis=1)

In [None]:
# The list-valued columns are no longer needed once the numeric features exist.
result = result.drop(columns=["classification_keyword", "confidence_value"])

In [None]:
# Name the ten embedding columns k0..k9 (image_id stays first).
result.columns = ['image_id'] + [f"k{idx}" for idx in range(10)]

In [None]:
result

Preprocessing Train Colors

In [None]:
train_df = pd.read_csv('/kaggle/input/can-i-be-an-influencer/train.csv')
train_colors_df = pd.read_csv('/kaggle/input/can-i-be-an-influencer/train_colors.csv')

In [None]:
num_unique_keywords = train_colors_df['keyword'].nunique()
print(f'Number of unique keywords(colors) in train_colors: {num_unique_keywords}')

In [None]:
# Keep only colour keywords whose row count reaches 5% of the number of
# distinct images.
keyword_counts = train_colors_df['keyword'].value_counts()
threshold = 0.05 * train_colors_df['image_id'].nunique()
keywords_to_keep = keyword_counts[keyword_counts >= threshold].index
# .copy() makes the filtered frame independent of train_colors_df, so the
# later assignment to its 'keyword' column is unambiguous and does not raise
# SettingWithCopyWarning.
filtered_train_colors = train_colors_df[train_colors_df['keyword'].isin(keywords_to_keep)].copy()
print(filtered_train_colors.head())

In [None]:
color_map = {
    'AliceBlue': 'LightBlue',
    'AntiqueWhite': 'Beige',
    'Aqua': 'Cyan',
    'Aquamarine': 'Green',
    'Azure': 'LightBlue',
    'Beige': 'Tan',
    'Bisque': 'Tan',
    'Black': 'Black',
    'BlanchedAlmond': 'Tan',
    'Blue': 'Blue',
    'BlueViolet': 'Purple',
    'Brown': 'Brown',
    'BurlyWood': 'Tan',
    'CadetBlue': 'Blue',
    'Chartreuse': 'Green',
    'Chocolate': 'Brown',
    'Coral': 'Orange',
    'CornflowerBlue': 'Blue',
    'Cornsilk': 'Beige',
    'Crimson': 'Red',
    'Cyan': 'Cyan',
    'DarkBlue': 'Blue',
    'DarkCyan': 'Cyan',
    'DarkGoldenrod': 'Brown',
    'DarkGray': 'Gray',
    'DarkGrey': 'Gray',
    'DarkGreen': 'Green',
    'DarkKhaki': 'Tan',
    'DarkMagenta': 'Purple',
    'DarkOliveGreen': 'Green',
    'DarkOrange': 'Orange',
    'DarkOrchid': 'Purple',
    'DarkRed': 'Red',
    'DarkSalmon': 'Orange',
    'DarkSeaGreen': 'Green',
    'DarkSlateBlue': 'Blue',
    'DarkSlateGray': 'Gray',
    'DarkSlateGrey': 'Gray',
    'DarkTurquoise': 'Cyan',
    'DarkViolet': 'Purple',
    'DeepPink': 'Pink',
    'DeepSkyBlue': 'Blue',
    'DimGray': 'Gray',
    'DimGrey': 'Gray',
    'DodgerBlue': 'Blue',
    'FireBrick': 'Red',
    'FloralWhite': 'Beige',
    'ForestGreen': 'Green',
    'Fuchsia': 'Purple',
    'Gainsboro': 'Gray',
    'GhostWhite': 'White',
    'Gold': 'Yellow',
    'Goldenrod': 'Yellow',
    'Gray': 'Gray',
    'Grey': 'Gray',
    'Green': 'Green',
    'GreenYellow': 'Green',
    'Honeydew': 'LightGreen',
    'HotPink': 'Pink',
    'IndianRed': 'Red',
    'Indigo': 'Purple',
    'Ivory': 'White',
    'Khaki': 'Tan',
    'Lavender': 'Purple',
    'LavenderBlush': 'Pink',
    'LawnGreen': 'Green',
    'LemonChiffon': 'Yellow',
    'LightBlue': 'LightBlue',
    'LightCoral': 'Pink',
    'LightCyan': 'Cyan',
    'LightGoldenrodYellow': 'Yellow',
    'LightGray': 'Gray',
    'LightGrey': 'Gray',
    'LightGreen': 'LightGreen',
    'LightPink': 'Pink',
    'LightSalmon': 'Orange',
    'LightSeaGreen': 'Green',
    'LightSkyBlue': 'LightBlue',
    'LightSlateGray': 'Gray',
    'LightSlateGrey': 'Gray',
    'LightSteelBlue': 'LightBlue',
    'LightYellow': 'Yellow',
    'Lime': 'Green',
    'LimeGreen': 'Green',
    'Linen': 'Beige',
    'Magenta': 'Purple',
    'Maroon': 'Red',
    'MediumAquamarine': 'Green',
    'MediumBlue': 'Blue',
    'MediumOrchid': 'Purple',
    'MediumPurple': 'Purple',
    'MediumSeaGreen': 'Green',
    'MediumSlateBlue': 'Blue',
    'MediumSpringGreen': 'Green',
    'MediumTurquoise': 'Cyan',
    'MediumVioletRed': 'Red',
    'MidnightBlue': 'Blue',
    'MintCream': 'LightGreen',
    'MistyRose': 'Pink',
    'Moccasin': 'Tan',
    'NavajoWhite': 'Tan',
    'Navy': 'Blue',
    'OldLace': 'Beige',
    'Olive': 'Green',
    'OliveDrab': 'Green',
    'Orange': 'Orange',
    'OrangeRed': 'Red',
    'Orchid': 'Purple',
    'PaleGoldenrod': 'Yellow',
    'PaleGreen': 'LightGreen',
    'PaleTurquoise': 'Cyan',
    'PaleVioletRed': 'Pink',
    'PapayaWhip': 'Yellow',
    'PeachPuff': 'Orange',
    'Peru': 'Brown',
    'Pink': 'Pink',
    'Plum': 'Purple',
    'PowderBlue': 'LightBlue',
    'Purple': 'Purple',
    'Red': 'Red',
    'RosyBrown': 'Brown',
    'RoyalBlue': 'Blue',
    'SaddleBrown': 'Brown',
    'Salmon': 'Orange',
    'SandyBrown': 'Brown',
    'SeaGreen': 'Green',
    'Seashell': 'Beige',
    'Sienna': 'Brown',
    'Silver': 'Gray',
    'SkyBlue': 'LightBlue',
    'SlateBlue': 'Blue',
    'SlateGray': 'Gray',
    'SlateGrey': 'Gray',
    'Snow': 'White',
    'SpringGreen': 'Green',
    'SteelBlue': 'Blue',
    'Tan': 'Tan',
    'Teal': 'Green',
    'Thistle': 'Purple',
    'Tomato': 'Red',
    'Turquoise': 'Cyan',
    'Violet': 'Purple',
    'Wheat': 'Tan',
    'White': 'White',
    'WhiteSmoke': 'Gray',
    'Yellow': 'Yellow',
    'YellowGreen': 'Green'
}

In [None]:
# Look-ups are done on lower-cased names, so lower-case the mapping keys.
color_map = {name.lower(): family for name, family in color_map.items()}
# Replace each keyword by its colour family, leaving unmapped keywords as they
# are. assign() builds a new frame instead of writing into a filtered slice of
# train_colors_df, which avoids SettingWithCopyWarning and any ambiguity about
# whether the write reaches the data.
filtered_train_colors = filtered_train_colors.assign(
    keyword=filtered_train_colors['keyword'].map(lambda kw: color_map.get(kw, kw))
)

In [None]:
# One row per image and one column per colour keyword. Cells hold the
# pivot_table default aggregate (mean) of color_score; missing image/colour
# combinations become 0.
colors_pivot = pd.pivot_table(
    filtered_train_colors,
    values='color_score',
    index='image_id',
    columns='keyword',
    fill_value=0,
).reset_index()
colors_pivot

In [None]:
# Default (inner) join: only images present in both train_df and colors_pivot
# are kept.
merged_df = pd.merge(train_df, colors_pivot, on='image_id')
merged_df.head()

In [None]:
# Columns that are dropped before any further processing.
columns_to_remove = [
    'photo_url',
    'photo_image_url',
    'photo_submitted_at',
    'description1',
    'description2',
    'latitude',
    'longitude',
    'camera_model',
]
merged_df = merged_df.drop(labels=columns_to_remove, axis='columns')

In [None]:
# Impute missing values: camera_make gets its most frequent value, the numeric
# columns get their column mean.
# Each result is assigned back to the column. Calling fillna(inplace=True) on a
# column selected from the frame is chained assignment, which newer pandas
# warns about and which does not modify the frame under copy-on-write.
merged_df['camera_make'] = merged_df['camera_make'].fillna(merged_df['camera_make'].mode()[0])
merged_df['iso'] = merged_df['iso'].fillna(merged_df['iso'].mean())
merged_df['aperture_value'] = merged_df['aperture_value'].fillna(merged_df['aperture_value'].mean())

# Convert exposure_time to numeric; values that cannot be parsed become NaN.
merged_df['exposure_time'] = pd.to_numeric(merged_df['exposure_time'], errors='coerce')

# Fill missing (including unparseable) exposure times with the mean.
merged_df['exposure_time'] = merged_df['exposure_time'].fillna(merged_df['exposure_time'].mean())

# Use image_id as the index.
merged_df = merged_df.set_index('image_id')

In [None]:
merged_df

In [None]:
def standardize_camera_make(df, pattern, standardized_name):
    """Overwrite matching ``camera_make`` values with one canonical name, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``camera_make`` column; it is modified in place.
    pattern : str
        Regular expression searched for (case-insensitively) in each value.
    standardized_name : str
        Value written to every row whose ``camera_make`` matches ``pattern``.

    Returns
    -------
    None
        Missing values never match and are left untouched.
    """
    hits = df['camera_make'].str.contains(pattern, case=False, na=False)
    df.loc[hits, 'camera_make'] = standardized_name

# (regex, canonical name) pairs: any camera_make containing the brand as a
# whole word is rewritten to the canonical spelling.
patterns_to_standardize = [
    (r'\b[Cc][Aa][Nn][Oo][Nn]\b', 'Canon'),
    (r'\b[Nn][Ii][Kk][Oo][Nn]\b', 'Nikon'),
    (r'\b[Ss][Aa][Mm][Ss][Uu][Nn][Gg]\b', 'Samsung'),
    (r'\b[Ss][Oo][Nn][Yy]\b', 'Sony'),
    (r'\b[Pp][Ee][Nn][Tt][Aa][Xx]\b', 'Pentax'),
    (r'\b[Mm][Oo][Tt][Oo][Rr][Oo][Ll][Aa]\b', 'Motorola'),
    (r'\b[Ff][Uu][Jj][Ii][Ff][Ii][Ll][Mm]\b', 'Fujifilm'),
    (r'\b[Oo][Ll][Yy][Mm][Pp][Uu][Ss]\b', 'Olympus'),
    (r'\b[Ll][Ee][Ii][Cc][Aa]\b', 'Leica'),
    (r'\b[Hh][Aa][Ss][Ss][Ee][Ll][Bb][Ll][Aa][Dd]\b', 'Hasselblad'),
    (r'\b[Dd][Jj][Ii]\b', 'DJI'),
    (r'\b[Gg][Oo][Pp][Rr][Oo]\b', 'GoPro'),
]

for brand_pattern, brand_name in patterns_to_standardize:
    standardize_camera_make(merged_df, brand_pattern, brand_name)

# Exact-match rename of one remaining value.
merged_df['camera_make'] = merged_df['camera_make'].replace({'XIAOYI': 'YI TECHNOLOGY'})

In [None]:
import re
def replace_zwo(name):
    """Collapse any camera make that mentions "ZWO" to the single label "ZWO".

    Parameters
    ----------
    name : object
        A camera-make value. Strings are searched case-insensitively for
        "ZWO"; anything that is not a string (e.g. NaN or None) is passed
        through untouched instead of raising ``TypeError`` in ``re.search``.

    Returns
    -------
    object
        ``"ZWO"`` when ``name`` is a string containing "ZWO" in any casing,
        otherwise ``name`` unchanged.
    """
    if isinstance(name, str) and re.search("ZWO", name, re.IGNORECASE):
        return "ZWO"
    return name

merged_df['camera_make'] = merged_df['camera_make'].apply(replace_zwo)

def replace_google(name):
    """Collapse any camera make that mentions "Google" to the single label "Google".

    Parameters
    ----------
    name : object
        A camera-make value. Strings are searched case-insensitively for
        "Google"; anything that is not a string (e.g. NaN or None) is passed
        through untouched instead of raising ``TypeError`` in ``re.search``.

    Returns
    -------
    object
        ``"Google"`` when ``name`` is a string containing "Google" in any
        casing, otherwise ``name`` unchanged.
    """
    if isinstance(name, str) and re.search("Google", name, re.IGNORECASE):
        return "Google"
    return name

merged_df['camera_make'] = merged_df['camera_make'].apply(replace_google)

In [None]:
merged_df

In [None]:
result.set_index('image_id',inplace=True)

In [None]:
merged_df=merged_df.merge(result,how='inner',left_index=True,right_index=True)

In [None]:
merged_df

In [None]:
merged_df.columns

In [None]:
merged_df.groupby(by = "obs_day")['stats_downloads'].mean().plot(kind = "bar")

In [None]:
merged_df.groupby(by = "obs_hour")['stats_downloads'].mean().plot(kind = "bar")

In [None]:
# Bucket obs_hour into day parts using right-closed intervals:
# (-1, 5] night, (5, 11] morning, (11, 16] afternoon, (16, 20] evening,
# (20, 23] night. 'night' is used for two intervals, hence ordered=False.
bins = [-1, 5, 11, 16, 20, 23]
labels = ['night', 'morning', 'afternoon', 'evening', 'night']
merged_df = merged_df.assign(
    time_of_day=pd.cut(merged_df['obs_hour'], bins=bins, labels=labels, ordered=False)
)

In [None]:
merged_df.drop(["obs_day","obs_hour","obs_min","obs_sec"],axis=1,inplace=True)

In [None]:
merged_df.groupby(by = "image_category")['stats_downloads'].mean().plot(kind = "bar")

In [None]:
# Exact-match spelling fixes for camera makes. No target value is itself a
# key, so one pass over the mapping equals applying the replacements in turn.
camera_make_aliases = {
    'LG Electronics': 'LGE',
    'Fujica': 'Fujifilm',
    'Xiaomi': 'XIAOMI',
    'Cannon': 'Canon',
    'HUAWEI': 'Huawei',
}
merged_df['camera_make'] = merged_df['camera_make'].replace(camera_make_aliases)

In [None]:
np.unique(merged_df["camera_make"])

In [None]:
merged_df.groupby(by = "camera_make")['stats_downloads'].mean().plot(kind = "bar")

In [None]:
merged_df.isnull().sum()

In [None]:
merged_df.drop(["country","city"],axis=1,inplace=True)

In [None]:
merged_df.dtypes

In [None]:
# One-hot encode the two categorical columns and append the indicator columns.
cols_to_transform = ['image_category', "time_of_day"]
dummies = pd.get_dummies(merged_df.loc[:, cols_to_transform])
merged_df = pd.concat((merged_df, dummies), axis='columns')



In [None]:
merged_df.columns

In [None]:
# drop the original categorical columns
merged_df.drop(cols_to_transform, axis=1, inplace=True)

In [None]:
merged_df.drop(['aspect_ratio','camera_make', 'exposure_time'],axis=1,inplace=True)

In [None]:
# MinMaxScaler is imported here but not used in this cell.
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
# Despite the name `mm`, this is a RobustScaler. The fitted object is reused
# later in the notebook to transform the test frame.
mm = RobustScaler()
# Numeric columns to scale: image/camera attributes, colour scores and the
# k0..k9 keyword-embedding features.
c=['width', 'height',  'iso', 'aperture_value',
       'focal_length', 'total_days',
       'Black', 'Blue', 'Brown', 'Gray', 'Green', 'LightBlue', 'Purple', 'Red',
       'Tan', 'Yellow', 'k0', 'k1', 'k2', 'k3', 'k4', 'k5', 'k6', 'k7', 'k8',
       'k9']
# Fit on the whole of merged_df and overwrite the columns with scaled values.
# NOTE(review): the scaler is fitted before the train/test split further down,
# so the held-out rows contribute to its statistics - confirm this is acceptable.
merged_df[c] = mm.fit_transform(merged_df[c])

In [None]:
# Target is stats_downloads; every other remaining column is a feature.
y = merged_df['stats_downloads']
X = merged_df.drop(columns=['stats_downloads'])

In [None]:
# Annotated correlation matrix of the scaled numeric features on a wide canvas.
fig, ax = plt.subplots(figsize=(30, 8))
sns.heatmap(X[c].corr(), annot=True, cmap='RdYlGn', linewidths=0.2, ax=ax)
plt.show()


In [None]:
#from sklearn.model_selection import train_test_split
#X_train,X_test,y_train,y_test = train_test_split(X,y, random_state = 100)

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Keep the 25 features with the highest univariate f_regression score
# against y.
# NOTE(review): the selector is fitted on all rows before the split below, so
# the held-out rows influence which features are chosen - confirm.
selector = SelectKBest(score_func=f_regression, k=25)
X_new = selector.fit_transform(X, y)

# Positional indices of the retained columns.
selected_features = selector.get_support(indices=True)

# Re-attach the column names; the frame gets a fresh 0..n-1 index.
X_selected = pd.DataFrame(X_new, columns=X.columns[selected_features])

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=0
)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Fit ordinary least squares on the training split, predict the held-out split.
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

# RMSE: square root of the mean squared error on the held-out split.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", rmse)

r2 = r2_score(y_test, y_pred)
print("R^2:", r2)

# Adjusted R^2 corrects R^2 for the number of features p given n held-out rows.
n, p = X_test.shape
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print("Adjusted R^2:", adjusted_r2)


In [None]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Support-vector regressor with library defaults.
reg = SVR()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

# RMSE: square root of the mean squared error on the held-out split.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", rmse)

r2 = r2_score(y_test, y_pred)
print("R^2:", r2)

# Adjusted R^2 corrects R^2 for the number of features p given n held-out rows.
n, p = X_test.shape
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print("Adjusted R^2:", adjusted_r2)


In [None]:
!pip install xgboost

import xgboost as xgb
# Train the XGBoost model on the training data
reg = xgb.XGBRegressor().fit(X_train, y_train)

# Make predictions on the test data
y_pred = reg.predict(X_test)

# Calculate the RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", rmse)

# Calculate the R^2 score
r2 = r2_score(y_test, y_pred)
print("R^2:", r2)

# Calculate the adjusted R^2 score
n = X_test.shape[0]
p = X_test.shape[1]
adjusted_r2 = 1-(1-r2)*(n-1)/(n-p-1)
print("Adjusted R^2:", adjusted_r2)


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forests are randomised (bootstrap samples, feature sub-sampling).
# Pinning random_state makes the reported metrics reproducible across runs;
# 0 matches the seed used for the train/test split.
reg = RandomForestRegressor(random_state=0)
reg.fit(X_train, y_train)

# Predict the held-out split.
y_pred = reg.predict(X_test)

# RMSE: square root of the mean squared error on the held-out split.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", rmse)

r2 = r2_score(y_test, y_pred)
print("R^2:", r2)

# Adjusted R^2 corrects R^2 for the number of features p given n held-out rows.
n, p = X_test.shape
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print("Adjusted R^2:", adjusted_r2)


In [None]:
!pip install catboost

import catboost as cb

# Train the CatBoost model on the training data
reg = cb.CatBoostRegressor().fit(X_train, y_train)

# Make predictions on the test data
y_pred = reg.predict(X_test)

# Calculate the RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", rmse)

# Calculate the R^2 score
r2 = r2_score(y_test, y_pred)
print("R^2:", r2)

# Calculate the adjusted R^2 score
n = X_test.shape[0]
p = X_test.shape[1]
adjusted_r2 = 1-(1-r2)*(n-1)/(n-p-1)
print("Adjusted R^2:", adjusted_r2)


In [None]:
from sklearn.linear_model import Lasso

# Lasso regression with library defaults.
reg = Lasso()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

# RMSE: square root of the mean squared error on the held-out split.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", rmse)

r2 = r2_score(y_test, y_pred)
print("R^2:", r2)

# Adjusted R^2 corrects R^2 for the number of features p given n held-out rows.
n, p = X_test.shape
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print("Adjusted R^2:", adjusted_r2)


In [None]:
from sklearn.linear_model import Ridge

# Ridge regression with library defaults.
reg = Ridge()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

# RMSE: square root of the mean squared error on the held-out split.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", rmse)

r2 = r2_score(y_test, y_pred)
print("R^2:", r2)

# Adjusted R^2 corrects R^2 for the number of features p given n held-out rows.
n, p = X_test.shape
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print("Adjusted R^2:", adjusted_r2)


In [None]:
from sklearn.linear_model import ElasticNet

# Elastic-net regression with library defaults. This is the last cell that
# binds `reg`, so in top-to-bottom execution this is the estimator later code
# sees under that name.
reg = ElasticNet()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

# RMSE: square root of the mean squared error on the held-out split.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", rmse)

r2 = r2_score(y_test, y_pred)
print("R^2:", r2)

# Adjusted R^2 corrects R^2 for the number of features p given n held-out rows.
n, p = X_test.shape
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print("Adjusted R^2:", adjusted_r2)


In [None]:
import tensorflow as tf

# Small fully connected network: 32 -> 64 -> 1 units, ReLU on the hidden layers.
# It is bound to `nn_model`, not `model`, because word_embed() reads the global
# `model` and expects the BERT model there; reusing that name would break any
# later call to word_embed() until BERT is loaded again.
nn_model = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

# Adam optimiser, mean squared error loss.
nn_model.compile(optimizer='adam', loss='mean_squared_error')

# Train silently for 100 epochs.
history = nn_model.fit(X_train, y_train, epochs=100, verbose=0)

# Keras returns predictions with shape (n, 1); flatten to (n,) so y_pred has
# the same shape as in the scikit-learn cells.
y_pred = nn_model.predict(X_test).ravel()

# RMSE: square root of the mean squared error on the held-out split.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", rmse)

r2 = r2_score(y_test, y_pred)
print("R^2:", r2)

# Adjusted R^2 corrects R^2 for the number of features p given n held-out rows.
n, p = X_test.shape
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print("Adjusted R^2:", adjusted_r2)


In [None]:
X_selected

In [None]:
X_selected.columns

**Test data processing**

In [None]:
# merged_df['aperture_value'] was overwritten with RobustScaler output from
# `mm`, so its plain mean is in scaled units, while the test column this value
# is used to fill is still raw at that point. RobustScaler applies
# scaled = (raw - center_) / scale_ per column, and the mean commutes with that
# affine map, so invert it to recover the training mean in raw units.
_aperture_pos = list(mm.feature_names_in_).index('aperture_value')
mean_aperture = (
    merged_df['aperture_value'].mean() * mm.scale_[_aperture_pos]
    + mm.center_[_aperture_pos]
)
mean_aperture

In [None]:
test_keyword=pd.read_csv('/kaggle/input/can-i-be-an-influencer/test_keywords.csv')
def top_5_matches(test_keyword):
    """Return the five highest-confidence rows followed by all repeated-keyword rows.

    Parameters
    ----------
    test_keyword : pandas.DataFrame
        Rows to reduce; must have ``confidence_value`` and
        ``classification_keyword`` columns.

    Returns
    -------
    pandas.DataFrame
        First the (at most) five rows with the largest ``confidence_value`` in
        descending order, then every row whose ``classification_keyword`` occurs
        more than once in the input (in input order). The index is renumbered
        from 0. A row may appear in both parts, so the result can hold more
        than five rows.
    """
    by_confidence = test_keyword.sort_values(by='confidence_value', ascending=False)
    strongest = by_confidence.iloc[:5]
    repeated_mask = test_keyword.duplicated(subset='classification_keyword', keep=False)
    repeated = test_keyword.loc[repeated_mask]
    return pd.concat([strongest, repeated], ignore_index=True)

# Reduce each test image's keyword rows with top_5_matches and renumber the
# rows 0..n-1.
output = test_keyword.groupby('image_id').apply(top_5_matches).reset_index(drop=True)

test_keyword.drop(columns=["image_classification"], inplace=True)

# Collapse each image's rows into one list of keywords and one list of confidences.
per_image = output.groupby('image_id')
grouped_key = per_image['classification_keyword'].agg(list).reset_index()
grouped_conf = per_image['confidence_value'].agg(list).reset_index()

# One row per image_id holding both lists.
result = grouped_key.merge(grouped_conf, on='image_id')

import os
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Directory of the attached dataset that holds the BERT files; both the
# tokenizer and the model are loaded from this same path.
BERT_MODEL_PATH = '/kaggle/input/bert-base-uncased'

# Tokenizer used by word_embed() to turn text into token ids.
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH)

# Rebinds the global `model` to a sequence-classification BERT wrapper.
# word_embed() only calls its `model.bert` sub-module.
model = BertForSequenceClassification.from_pretrained(BERT_MODEL_PATH)


def word_embed(input_text):
    """Embed a text as the first 10 components of its token-averaged BERT output.

    The text is encoded with the module-level ``tokenizer`` (special tokens
    added, truncated to at most 512 tokens) and passed through ``model.bert``
    with gradient tracking disabled.

    Parameters
    ----------
    input_text : str
        Text to embed.

    Returns
    -------
    torch.Tensor
        The first 10 entries of the mean, taken over the token axis, of the
        first output of ``model.bert`` for the single encoded sequence.
    """
    token_ids = tokenizer.encode(
        input_text,
        add_special_tokens=True,
        truncation=True,
        max_length=512,
        return_tensors='pt',
    )
    with torch.no_grad():
        first_output = model.bert(token_ids)[0]
    # Index 0 selects the only sequence in the batch.
    per_token = first_output[0]
    return per_token.mean(dim=0)[:10]


In [None]:
# Embed every distinct test keyword exactly once and cache the vector by keyword.
p = {keyword for keywords in result["classification_keyword"] for keyword in keywords}
from tqdm import tqdm
dict_val = {keyword: word_embed(keyword) for keyword in tqdm(list(p))}



In [None]:
from tqdm import tqdm
c = result["classification_keyword"]
n = len(c)
features = np.zeros((n, 10))

# Row i = sum of the cached vectors of image i's keywords, divided by 5.
# NOTE(review): the divisor is fixed at 5, but top_5_matches can return more
# or fewer than five rows per image - confirm that sum/5 is intended.
for i in tqdm(range(n)):
    val = np.array([dict_val[w] for w in c[i]])
    features[i] = val.sum(axis=0) / 5

# Append the (n, 10) feature matrix as ten columns in one step. Building the
# frame directly avoids the temporary list-valued 'features' column and the
# slow row-by-row apply(pd.Series) expansion, and yields the same values.
result = pd.concat([result, pd.DataFrame(features, index=result.index)], axis=1)
# The list-valued columns are no longer needed once the numeric features exist.
result = result.drop(columns=["classification_keyword", "confidence_value"])
# Name the ten embedding columns k0..k9 (image_id stays first).
result.columns = ['image_id', "k0", "k1", "k2", "k3", "k4", "k5", "k6", "k7", "k8", "k9"]




In [None]:
result.shape

In [None]:
test_df = pd.read_csv('/kaggle/input/can-i-be-an-influencer/test.csv')
test_colors_df = pd.read_csv('/kaggle/input/can-i-be-an-influencer/test_colors.csv')
keyword_counts = test_colors_df['keyword'].value_counts()
# A threshold of 0 keeps every keyword that occurs at all; no frequency
# filtering is applied to the test colours.
threshold = 0.00 * test_colors_df['image_id'].nunique()
keywords_to_keep = keyword_counts[keyword_counts >= threshold].index
# The name is reused from the training section, but from here on it holds TEST
# colours. .copy() makes the frame independent of test_colors_df, so the later
# assignment to its 'keyword' column does not raise SettingWithCopyWarning.
filtered_train_colors = test_colors_df[test_colors_df['keyword'].isin(keywords_to_keep)].copy()
print(filtered_train_colors.shape)

In [None]:
# Look-ups are done on lower-cased names; lower-casing the keys again is harmless.
color_map = {name.lower(): family for name, family in color_map.items()}
# Replace each keyword by its colour family, leaving unmapped keywords as they
# are. assign() builds a new frame instead of writing into a filtered slice of
# test_colors_df, which avoids SettingWithCopyWarning.
filtered_train_colors = filtered_train_colors.assign(
    keyword=filtered_train_colors['keyword'].map(lambda kw: color_map.get(kw, kw))
)
# One row per image and one column per colour keyword. Cells hold the
# pivot_table default aggregate (mean) of color_score; missing combinations
# become 0.
colors_pivot = filtered_train_colors.pivot_table(
    index='image_id',
    columns='keyword',
    values='color_score',
    fill_value=0,
).reset_index()
colors_pivot

In [None]:
# Left join keeps every test image, including those without any colour rows.
test_merged_df = test_df.merge(colors_pivot, on='image_id', how='left')

# Zero-fill ONLY the columns that the join added: an image without colour rows
# has score 0 for every colour. A frame-wide fillna(0) at this point would also
# zero iso / aperture_value / camera_make and turn the targeted imputations
# below into no-ops.
color_columns = test_merged_df.columns.difference(test_df.columns)
test_merged_df[color_columns] = test_merged_df[color_columns].fillna(0)

columns_to_remove = ['photo_url', 'photo_image_url', 'photo_submitted_at', 'description1', 'description2', 'latitude', 'longitude', 'camera_model']
test_merged_df = test_merged_df.drop(columns=columns_to_remove)

# Same rules as the training frame: most frequent value for camera_make, column
# mean for iso and aperture_value. Results are assigned back to the column
# because fillna(inplace=True) on a selected column is chained assignment.
test_merged_df['camera_make'] = test_merged_df['camera_make'].fillna(test_merged_df['camera_make'].mode()[0])
test_merged_df['iso'] = test_merged_df['iso'].fillna(test_merged_df['iso'].mean())
test_merged_df['aperture_value'] = test_merged_df['aperture_value'].fillna(test_merged_df['aperture_value'].mean())
# Convert exposure_time to numeric; values that cannot be parsed become NaN.
test_merged_df['exposure_time'] = pd.to_numeric(test_merged_df['exposure_time'], errors='coerce')
# Fallback: only has an effect if the test column had no usable mean of its own.
test_merged_df['aperture_value'] = test_merged_df['aperture_value'].fillna(mean_aperture)

# Catch-all, applied last so it cannot pre-empt the imputations above: any
# value still missing in any column becomes 0, so no NaN reaches scaling or
# prediction.
test_merged_df = test_merged_df.fillna(0)

In [None]:
# Bucket obs_hour into day parts using right-closed intervals:
# (-1, 5] night, (5, 11] morning, (11, 16] afternoon, (16, 20] evening,
# (20, 23] night. 'night' is used for two intervals, hence ordered=False.
bins = [-1, 5, 11, 16, 20, 23]
labels = ['night', 'morning', 'afternoon', 'evening', 'night']
test_merged_df = test_merged_df.assign(
    time_of_day=pd.cut(test_merged_df['obs_hour'], bins=bins, labels=labels, ordered=False)
)

In [None]:
# Index both frames by image_id and join them on that index. The inner join
# keeps only images present in both frames.
test_merged_df = test_merged_df.set_index('image_id')
result = result.set_index('image_id')
test_merged_df = test_merged_df.merge(result, how='inner', left_index=True, right_index=True)

In [None]:
test_merged_df

In [None]:
# Same column list, in the same order, that the scaler `mm` was fitted with.
c = [
    'width', 'height', 'iso', 'aperture_value', 'focal_length', 'total_days',
    'Black', 'Blue', 'Brown', 'Gray', 'Green', 'LightBlue', 'Purple', 'Red',
    'Tan', 'Yellow',
] + [f'k{idx}' for idx in range(10)]
# transform (not fit_transform): reuse the statistics learned on the training frame.
test_merged_df[c] = mm.transform(test_merged_df[c])

In [None]:
list(X_selected.columns)

In [None]:
test_merged_df.columns

In [None]:
# One-hot encode the two categorical columns and append the indicator columns;
# the combined frame gets its own name so test_merged_df stays unchanged.
cols_to_transform = ['image_category', 'time_of_day']
dummies = pd.get_dummies(test_merged_df.loc[:, cols_to_transform])
test_merged_df1 = pd.concat((test_merged_df, dummies), axis='columns')



In [None]:
# Align the test features to exactly the columns (and order) used for training.
# get_dummies only creates a column for a category level that occurs in the
# frame it is given, so a level seen in training but absent from the test data
# has no column here and plain [] selection would raise KeyError. Such
# indicator columns are added as all-zero; a missing non-indicator feature is
# still treated as an error.
expected_columns = list(X_selected.columns)
missing_columns = [col for col in expected_columns if col not in test_merged_df1.columns]
unexpected_missing = [
    col for col in missing_columns
    if not col.startswith(('image_category_', 'time_of_day_'))
]
if unexpected_missing:
    raise KeyError(f"Test data lacks required feature columns: {unexpected_missing}")
test_subset_merged1 = test_merged_df1.reindex(columns=expected_columns, fill_value=0)

In [None]:
test_subset_merged1.head()

In [None]:
df_test_pred = pd.DataFrame()

In [None]:
df_test_pred['image_id'] = (test_merged_df.index)

In [None]:
test_subset_merged1.isnull().sum()

In [None]:
# Predict with whatever estimator is currently bound to `reg`. With the cells
# run top to bottom, that is the most recently fitted scikit-learn-style model
# (the last cell that assigns `reg` fits an ElasticNet).
# NOTE(review): confirm this is the model intended for the submission.
df_test_pred['pred'] = reg.predict(test_subset_merged1)

In [None]:
# Start from the sample submission, attach the predictions by image_id, then
# replace the placeholder target column with them.
sam = pd.read_csv('/kaggle/input/can-i-be-an-influencer/sample_submission.csv')
sam = pd.merge(sam, df_test_pred, on='image_id', how='inner')
sam = sam.drop(columns='stats_downloads')
sam.columns = ['image_id', 'stats_downloads']

In [None]:
sam.to_csv('submission.csv', index = False)