In [85]:
import pandas as pd
from collections import Counter

# Load the data
file_path = '/content/outlier_handled.csv'  # Replace with the actual path
df = pd.read_csv(file_path)


def _split_amenities(raw):
    """Split one raw amenities cell into a list of amenity tokens.

    Only the surrounding brackets and any single quotes are removed, exactly as
    before, so tokens keep whatever double quotes the raw text carries. The
    feature-selection cell later refers to columns such as '"Wifi"' by that
    quoted name, so the token format must not change.

    Returns an empty list for a missing (non-string) cell and drops empty
    tokens, so a cell holding just '[]' contributes no amenity at all.
    """
    if not isinstance(raw, str):
        # NaN/None: treat as "no amenities" instead of failing when flattening
        return []
    tokens = raw.strip("[]").replace("'", "").split(", ")
    # '[]' strips down to '' and ''.split(', ') yields [''], which is not an amenity
    return [token for token in tokens if token]


# Prepare the amenities column by removing brackets and splitting into lists
# NOTE(review): splitting on ", " also splits any amenity whose own name contains ", " - confirm against the data
df['amenities'] = df['amenities'].apply(_split_amenities)

# Flatten the list of amenities and count occurrences
all_amenities = [amenity for tokens in df['amenities'] for amenity in tokens]
amenity_counts = Counter(all_amenities)

# Get the top 7 most common amenities
top_7_amenities = [amenity for amenity, _ in amenity_counts.most_common(7)]

# Create binary columns for each of the top 7 amenities
for amenity in top_7_amenities:
    # Bind the current name as a default so the lambda never depends on the loop variable
    df[amenity] = df['amenities'].apply(lambda tokens, name=amenity: 1 if name in tokens else 0)

# Drop the original amenities column now that the indicator columns exist
df = df.drop(columns=['amenities'])

# Display the resulting dataframe with binary columns for the top amenities
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9998 entries, 0 to 9997
Data columns (total 61 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            9998 non-null   int64  
 1   last_scraped                                  9998 non-null   object 
 2   name                                          9998 non-null   object 
 3   description                                   9998 non-null   object 
 4   host_name                                     9998 non-null   object 
 5   host_since                                    9998 non-null   object 
 6   host_location                                 9998 non-null   object 
 7   host_response_time                            9998 non-null   object 
 8   host_response_rate                            9998 non-null   float64
 9   host_acceptance_rate                          9998 non-null   f

In [73]:
# Show the frame's dimensions as (rows, columns)
df.shape

(9998, 61)

In [86]:
# List the distinct values present in the 'city' column
print(df['city'].unique())

['los_angeles' 'san_diego' 'san_francisco' 'santa_clara' 'seattle'
 'portland' 'san_mateo' 'oakland' 'santa_cruz' 'pacific_grove']


In [87]:
import ast
def parse_verifications(verifications):
    """Convert one raw ``host_verifications`` cell into a Python object.

    Parameters
    ----------
    verifications : object
        Expected to be the text of a Python literal, presumably a list such as
        "['email', 'phone']". A value that is already a list, tuple or set is
        returned unchanged, so applying this function a second time (e.g. when
        the cell is re-run on an already-parsed column) does not wipe the data.

    Returns
    -------
    object
        The value produced by ``ast.literal_eval``, or ``[]`` when the input
        cannot be parsed (NaN/None, malformed or truncated text).
    """
    if isinstance(verifications, (list, tuple, set)):
        # Already parsed: literal_eval would reject it and the data would become []
        return verifications
    try:
        return ast.literal_eval(verifications)
    except (ValueError, TypeError, SyntaxError):
        # literal_eval raises SyntaxError (not ValueError) for text that is not
        # valid Python, e.g. an unterminated list
        return []

# Parse every raw host_verifications cell with parse_verifications
df['host_verifications'] = df['host_verifications'].apply(parse_verifications)

# Map each new 0/1 indicator column to the verification method it looks for
verification_flags = {
    'has_email': 'email',
    'has_phone': 'phone',
    'has_work_email': 'work_email',
}
for flag_column, method in verification_flags.items():
    df[flag_column] = df['host_verifications'].apply(
        lambda methods, wanted=method: 1 if wanted in methods else 0
    )

# Print the resulting DataFrame
print(df)

                       id last_scraped  \
0     1000872389527520310   2024-09-04   
1                21216996   2024-09-05   
2      810908225191909546   2024-09-05   
3                39310458   2024-09-05   
4      691931631507059313   2024-09-05   
...                   ...          ...   
9993              3821616   2024-06-30   
9994              7292129   2024-06-30   
9995   910124947116022766   2024-06-30   
9996             38539935   2024-06-30   
9997              9155279   2024-06-30   

                                                   name  \
0                              #10 Modern condo 4B/2.5B   
1     Stairway to Los Angeles Views Room andprivate ...   
2                       Hollywood Hills House with View   
3     Private Long Beach Rental: 2Bd 1Bth, mins-LBG,...   
4            LUXURY BEVERLY VILLA!! - PRIME LOCATION!!!   
...                                                 ...   
9993       Bright Beach Cottage, Center of town, 30-day   
9994     ~Chestnut Cott

In [88]:
# Step 1: One-hot encode room_type, producing 0/1 integer columns directly
room_type_encoded = pd.get_dummies(df['room_type'], prefix='room_type', dtype=int)

# Step 2: Remove indicator columns left behind by an earlier run of this cell,
# so re-running it does not append duplicate room_type_* columns
df = df.drop(columns=room_type_encoded.columns, errors='ignore')

# Step 3: Concatenate the encoded columns with the original DataFrame
df = pd.concat([df, room_type_encoded], axis=1)


In [76]:
# Summarise the current frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9998 entries, 0 to 9997
Data columns (total 68 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            9998 non-null   int64  
 1   last_scraped                                  9998 non-null   object 
 2   name                                          9998 non-null   object 
 3   description                                   9998 non-null   object 
 4   host_name                                     9998 non-null   object 
 5   host_since                                    9998 non-null   object 
 6   host_location                                 9998 non-null   object 
 7   host_response_time                            9998 non-null   object 
 8   host_response_rate                            9998 non-null   float64
 9   host_acceptance_rate                          9998 non-null   f

In [89]:

# Step 1: Count how often each property type occurs
property_type_counts = df['property_type'].value_counts()

# Print the frequency table for reference
print("Property Type Frequency:\n", property_type_counts)

# Step 2: Keep the top_n most frequent property types
top_n = 8
top_property_types = property_type_counts.head(top_n).index.tolist()

# Step 3: Add one 0/1 indicator column per frequent property type
for property_type in top_property_types:
    column_name = 'property_type_' + property_type.replace(" ", "_")  # spaces -> underscores
    df[column_name] = df['property_type'].eq(property_type).astype('int64')

# Step 4: Flag every row whose property type is not among the frequent ones
is_frequent_type = df['property_type'].isin(top_property_types)
df['property_type_Other'] = (~is_frequent_type).astype('int64')

# Summarise the resulting DataFrame
df.info()

Property Type Frequency:
 property_type
Entire home                2718
Entire rental unit         2324
Private room in home       1300
Entire condo                617
Entire guesthouse           571
                           ... 
Train                         1
Floor                         1
Private room in cottage       1
Shared room in bungalow       1
Entire chalet                 1
Name: count, Length: 75, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9998 entries, 0 to 9997
Data columns (total 77 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            9998 non-null   int64  
 1   last_scraped                                  9998 non-null   object 
 2   name                                          9998 non-null   object 
 3   description                                   9998 non-null   object 
 4   host_nam

In [90]:
# Step 1: One-hot encode city, producing 0/1 integer columns directly
city_encoded = pd.get_dummies(df['city'], prefix='city', dtype=int)

# Step 2: Remove indicator columns left behind by an earlier run of this cell,
# so re-running it does not append duplicate city_* columns
df = df.drop(columns=city_encoded.columns, errors='ignore')

# Step 3: Concatenate the new columns with the original DataFrame
df = pd.concat([df, city_encoded], axis=1)

In [91]:
# Cast the two host flag columns to integers
for flag_column in ('host_is_superhost', 'host_identity_verified'):
    df[flag_column] = df[flag_column].astype(int)


In [80]:
# Summarise the current frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9998 entries, 0 to 9997
Data columns (total 87 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            9998 non-null   int64  
 1   last_scraped                                  9998 non-null   object 
 2   name                                          9998 non-null   object 
 3   description                                   9998 non-null   object 
 4   host_name                                     9998 non-null   object 
 5   host_since                                    9998 non-null   object 
 6   host_location                                 9998 non-null   object 
 7   host_response_time                            9998 non-null   object 
 8   host_response_rate                            9998 non-null   float64
 9   host_acceptance_rate                          9998 non-null   f

In [92]:
# Write the full frame, including all engineered columns, to CSV without the index
df.to_csv('feature_Extraction.csv', index=False)

In [93]:
# Keep only the columns used for model training; every other column is left out.
# The amenity names carry literal double quotes because that is how the
# amenity indicator columns are named in df.
features_to_keep = [
    # host flags
    'host_is_superhost',
    'host_identity_verified',
    # location
    'latitude',
    'longitude',
    # listing size and availability
    'accommodates',
    'bathrooms',
    'beds',
    'availability_365',
    # amenity indicators
    '"Smoke alarm"',
    '"Wifi"',
    '"Carbon monoxide alarm"',
    '"Kitchen"',
    '"Essentials"',
    '"Hangers"',
    '"Hot water"',
    # host verification indicators
    'has_email',
    'has_phone',
    'has_work_email',
    # room type indicators
    'room_type_Entire home/apt',
    'room_type_Hotel room',
    'room_type_Private room',
    'room_type_Shared room',
    # property type indicators
    'property_type_Entire_home',
    'property_type_Entire_rental_unit',
    'property_type_Private_room_in_home',
    'property_type_Entire_condo',
    'property_type_Entire_guesthouse',
    'property_type_Entire_guest_suite',
    'property_type_Private_room_in_rental_unit',
    'property_type_Entire_townhouse',
    'property_type_Other',
    # city indicators
    'city_los_angeles',
    'city_oakland',
    'city_pacific_grove',
    'city_portland',
    'city_san_diego',
    'city_san_francisco',
    'city_san_mateo',
    'city_santa_clara',
    'city_santa_cruz',
    'city_seattle',
    # review score and price
    'review_scores_rating',
    'price_in_dollar',
]

# Select the listed columns, in this order, and save them without the index
df_filtered = df.loc[:, features_to_keep]
df_filtered.to_csv('data_needed_for_modeling.csv', index=False)
