In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; later cells read and write files under
# /content/drive/MyDrive/Google Play Store/.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Final pipeline for categorization**

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Location of the raw Play Store CSV on the mounted Drive
url = "/content/drive/MyDrive/Google Play Store/googleplaystore.csv"

# Load and clean in a single pipeline; every step applies one filtering rule
df = (
    pd.read_csv(url)
    # keep only rows where all four key fields are present
    .dropna(subset=['App', 'Category', 'Rating', 'Installs'])
    # discard rows whose app name or category is purely numeric text
    .loc[lambda frame: ~frame['App'].str.isnumeric() & ~frame['Category'].str.isnumeric()]
    # keep a single row per app name
    .drop_duplicates(subset=['App'])
    # make 'Rating' numeric; entries that cannot be parsed become NaN
    .assign(Rating=lambda frame: pd.to_numeric(frame['Rating'], errors='coerce'))
    # ratings above 5.0 are rejected; NaN ratings fail the comparison and go too
    .loc[lambda frame: frame['Rating'] <= 5.0]
)
# Convert a 'Size' label ('19M', '201k') to a whole number of base units
def convert_size_to_million(size):
    """Convert a size label with an 'M' or 'k' suffix to an integer.

    Despite the name (kept so existing callers keep working), the result is
    not expressed in millions: 'M' multiplies by 1e6 and 'k' by 1e3, so
    '19M' -> 19000000 and '201k' -> 201000.

    Parameters
    ----------
    size : str or any
        A label such as '19M' or '201k'. Surrounding whitespace is ignored.

    Returns
    -------
    int
        The scaled value when the label ends in 'M' or 'k'.
    same as input
        The value unchanged when it is not a string (e.g. NaN or an already
        numeric size) or carries no recognised suffix (e.g.
        'Varies with device').

    Raises
    ------
    ValueError
        If the text before a recognised suffix is not a valid number.
    """
    # Missing values arrive as float NaN; `'M' in nan` would raise TypeError
    if not isinstance(size, str):
        return size

    text = size.strip()
    for suffix, factor in (('M', 1e6), ('k', 1e3)):
        if text.endswith(suffix):
            # round() rather than int(): binary float error can leave the
            # product just below the intended integer (1.005 * 1e3 is
            # 1004.9999999999999), which int() would truncate
            return round(float(text[:-1]) * factor)

    return size

# Drop apps whose size is not a fixed number. Take an explicit copy so the
# column assignments below modify this frame rather than a slice of the
# previous one (avoids SettingWithCopyWarning).
df = df[df['Size'] != 'Varies with device'].copy()

# Turn size labels ('19M', '201k') into numbers and force a numeric dtype
df['Size'] = pd.to_numeric(df['Size'].apply(convert_size_to_million))

# Strip the trailing '+' and the thousands separators from 'Installs', then
# convert to numbers. regex=False makes the literal match explicit: '+' is a
# regex metacharacter and relying on the default triggers a FutureWarning on
# older pandas versions.
df['Installs'] = pd.to_numeric(
    df['Installs']
    .str.replace('+', '', regex=False)
    .str.replace(',', '', regex=False)
)

# Sort the DataFrame by 'Rating' in descending order
sorted_df = df.sort_values(by='Rating', ascending=False)

# Extract the columns for normalization
columns_to_normalize = ['Rating', 'Size', 'Installs']
data_to_normalize = sorted_df[columns_to_normalize]

# Apply min-max normalization: each column is rescaled to the range [0, 1]
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(data_to_normalize)

# Replace the original columns with normalized values
sorted_df[columns_to_normalize] = normalized_data

# Keep only the app name and the three normalized features. The copy makes
# `result` an independent frame, so later cells can add columns to it safely.
result = sorted_df[['App', 'Rating', 'Size', 'Installs']].copy()
print("Normalized Data:")
print(result)

# Describe the normalized columns
normalized_stats = result[columns_to_normalize].describe()
print("\nNormalized Columns Statistics:")
print(normalized_stats)



Normalized Data:
                                                    App  Rating      Size  \
7204          TI-84 CE Graphing Calculator Manual TI 84     1.0  0.269938   
5795                                   Axe Champs! Wars     1.0  0.249936   
8104                           Cy-Fair Christian Church     1.0  0.092923   
8102                          Cy-Fair VFD EMS Protocols     1.0  0.199932   
8063                            cx advance call blocker     1.0  0.033918   
...                                                 ...     ...       ...   
6490                                             MbH BM     0.0  0.022917   
10591  Lottery Ticket Checker - Florida Results & Lotto     0.0  0.409950   
4127                                  Speech Therapy: F     0.0  0.159929   
8820                                     DS Creator 2.0     0.0  0.043919   
7383                                     Thistletown CI     0.0  0.065921   

           Installs  
7204   9.900000e-08  
5795   4.90000

  df['Installs'] = df['Installs'].str.replace('+', '').str.replace(',', '')


In [None]:
import pandas as pd

def categorize_apps(df, a_min=(0.875, 0.309941, 0.001), b_min=(0.750, 0.048919, 0.00001)):
    """Label each app 'A', 'B' or 'C' from its Rating, Size and Installs.

    The input frame is left untouched; a labelled copy is returned. Any
    'Category' column already present in the input is replaced in the copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Rating', 'Size' and 'Installs'. The default
        thresholds are fractions between 0 and 1, so they assume the columns
        were min-max normalized beforehand.
    a_min : tuple of 3 floats, optional
        Lower bounds (rating, size, installs) for category 'A'. A row is 'A'
        when all three values are at or above their bound.
    b_min : tuple of 3 floats, optional
        Lower bounds (rating, size, installs) for category 'B'. A row is 'B'
        when all three values are at or above their `b_min` bound and
        strictly below the matching `a_min` bound.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with a 'Category' column. Rows matching neither rule
        are labelled 'C'.
    """
    a_rating, a_size, a_installs = a_min
    b_rating, b_size, b_installs = b_min

    # Work on a copy: the caller's frame (possibly a slice of another frame)
    # must not be mutated as a side effect
    labelled = df.copy()
    rating = labelled['Rating']
    size = labelled['Size']
    installs = labelled['Installs']

    # Category A: every feature reaches its upper-tier bound
    in_a = (rating >= a_rating) & (size >= a_size) & (installs >= a_installs)

    # Category B: every feature sits in the half-open band [b bound, a bound)
    in_b = (
        (rating >= b_rating) & (rating < a_rating)
        & (size >= b_size) & (size < a_size)
        & (installs >= b_installs) & (installs < a_installs)
    )

    # 'C' is the default; the A and B masks cannot overlap
    labelled['Category'] = 'C'
    labelled.loc[in_a, 'Category'] = 'A'
    labelled.loc[in_b, 'Category'] = 'B'

    return labelled


# Label every app in the normalized table; `df` is rebound to the labelled frame
df = categorize_apps(pd.DataFrame(result))

# Show the three features next to the label each app received
print("DataFrame with 'Category' column:")
print(df[['App', 'Rating', 'Size', 'Installs', 'Category']])

# Persist the labelled table without the row index; the model section reads
# this file back from the same path
df.to_csv('/content/drive/MyDrive/Google Play Store/category.csv', index=False)


DataFrame with 'Category' column:
                                                    App  Rating      Size  \
7204          TI-84 CE Graphing Calculator Manual TI 84     1.0  0.269938   
5795                                   Axe Champs! Wars     1.0  0.249936   
8104                           Cy-Fair Christian Church     1.0  0.092923   
8102                          Cy-Fair VFD EMS Protocols     1.0  0.199932   
8063                            cx advance call blocker     1.0  0.033918   
...                                                 ...     ...       ...   
6490                                             MbH BM     0.0  0.022917   
10591  Lottery Ticket Checker - Florida Results & Lotto     0.0  0.409950   
4127                                  Speech Therapy: F     0.0  0.159929   
8820                                     DS Creator 2.0     0.0  0.043919   
7383                                     Thistletown CI     0.0  0.065921   

           Installs Category  
7204   9.9

In [None]:
# Tally how many apps received each label
category_counts = df['Category'].value_counts()
print('\nCategory Counts:', category_counts, sep='\n')


Category Counts:
C    6083
B     755
A     189
Name: Category, dtype: int64


# **Showing specific Apps**

In [None]:
import pandas as pd

# Start from the in-memory normalized table (nothing is re-read from disk)
# and re-apply the cleaning rules as one pipeline
df = (
    pd.DataFrame(result)
    # rows must have an app name, a rating and an install count
    .dropna(subset=['App', 'Rating', 'Installs'])
    # discard rows whose app name is purely numeric text
    .loc[lambda frame: ~frame['App'].str.isnumeric()]
    # discard ratings above 5.0
    .loc[lambda frame: frame['Rating'] <= 5.0]
    # discard rows whose size is the literal 'Varies with device'
    .loc[lambda frame: frame['Size'] != 'Varies with device']
    # keep a single row per app name
    .drop_duplicates(subset=['App'])
)

# Apps to single out for inspection
app_names_to_keep = [
    'English Grammar Complete Handbook',
    'Messages, Text and Video Chat for Messenger',
    'Hostelworld: Hostels & Cheap Hotels Travel App',
    'Helping BD',
    'IMDb Movies & TV',
    "McDonald's",
    'Lose Weight in 30 Days',
    'Temple Run 2',
    'PUBG MOBILE',
    'Amazon Shopping',
]

# Keep only the listed apps. Note that this rebinds `result` to the subset.
result = df.loc[df['App'].isin(app_names_to_keep)]

# Highest-rated apps first
sorted_df = result.sort_values('Rating', ascending=False)

# Columns to show: app name plus the three normalized features
final_result = sorted_df.loc[:, ['App', 'Rating', 'Size', 'Installs']]

# Print the sorted view
print(final_result)
# Write the subset to CSV; the frame saved is `result`, not the sorted view
result.to_csv('/content/drive/MyDrive/Google Play Store/specification.csv', index=False)

                                                 App  Rating      Size  \
6045                                      Helping BD   1.000  0.044919   
1261                          Lose Weight in 30 Days   0.950  0.109924   
150                English Grammar Complete Handbook   0.900  0.027917   
3113  Hostelworld: Hostels & Cheap Hotels Travel App   0.850  0.279939   
2547     Messages, Text and Video Chat for Messenger   0.850  0.039918   
1785                                     PUBG MOBILE   0.850  0.359946   
2664                                 Amazon Shopping   0.825  0.419951   
1661                                    Temple Run 2   0.825  0.619968   
888                                 IMDb Movies & TV   0.800  0.119925   
1176                                      McDonald's   0.650  0.419951   

          Installs  
6045  9.900000e-08  
1261  9.999999e-03  
150   4.999990e-04  
3113  9.999990e-04  
2547  9.999999e-03  
1785  5.000000e-02  
2664  1.000000e-01  
1661  5.000000e-0

In [None]:
# Wrap whatever `result` currently holds in a DataFrame.
# NOTE(review): the preceding cell rebinds `result` to its filtered subset, so
# the input of this cell depends on which cells ran before it - confirm intended.
df = pd.DataFrame(result)

# Apps to single out for inspection
app_names_to_keep = [
    'English Grammar Complete Handbook',
    'Messages, Text and Video Chat for Messenger',
    'Hostelworld: Hostels & Cheap Hotels Travel App',
    'Helping BD',
    'IMDb Movies & TV',
    "McDonald's",
    'Lose Weight in 30 Days',
    'Temple Run 2',
    'PUBG MOBILE',
    'Amazon Shopping',
]

# Keep only the rows whose 'App' value is in the list (rebinds `result`)
result = df[df['App'].isin(app_names_to_keep)]

# Sort the subset by 'Rating' in descending order
sorted_df = result.sort_values(by='Rating', ascending=False)

# Select the 'App', 'Rating', 'Size', and 'Installs' columns for display
final_result = sorted_df[['App', 'Rating', 'Size', 'Installs']]

# Print the sorted view
print(final_result)
# Write the subset to a new CSV file; the frame saved is `result`, not the
# sorted `final_result`
result.to_csv('/content/drive/MyDrive/Google Play Store/2_specification.csv', index=False)

                                                 App  Rating      Size  \
6045                                      Helping BD   1.000  0.044919   
1261                          Lose Weight in 30 Days   0.950  0.109924   
150                English Grammar Complete Handbook   0.900  0.027917   
3113  Hostelworld: Hostels & Cheap Hotels Travel App   0.850  0.279939   
2547     Messages, Text and Video Chat for Messenger   0.850  0.039918   
1785                                     PUBG MOBILE   0.850  0.359946   
2664                                 Amazon Shopping   0.825  0.419951   
1661                                    Temple Run 2   0.825  0.619968   
888                                 IMDb Movies & TV   0.800  0.119925   
1176                                      McDonald's   0.650  0.419951   

          Installs  
6045  9.900000e-08  
1261  9.999999e-03  
150   4.999990e-04  
3113  9.999990e-04  
2547  9.999999e-03  
1785  5.000000e-02  
2664  1.000000e-01  
1661  5.000000e-0

# **Model**

In [None]:
import pandas as pd
# Read the labelled table back from the CSV path the categorization cell wrote to
df= pd.read_csv('/content/drive/MyDrive/Google Play Store/category.csv')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [None]:
# Build the feature matrix and the target vector
cols = ['Rating','Size','Installs']
X, y = df[cols], df['Category']

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Gaussian Naive Bayes: fit on the training split, predict the held-out split
model1 = GaussianNB()
model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)

# Report accuracy, then the macro-averaged scores, all as percentages
print("Accuracy:", accuracy_score(y_test, y_pred)*100)
for label, scorer in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, scorer(y_test, y_pred, average='macro')*100)

Accuracy: 70.34718269778031
Precision: 53.97060224186233
Recall: 66.31399035432867
F1 Score: 52.30382225883656


In [None]:
# Support vector classifier with a linear kernel: fit on the training split,
# predict the held-out split
model2 = svm.SVC(kernel='linear')
model2 = model2.fit(X_train, y_train)
y_pred = model2.predict(X_test)

# Report accuracy and macro-averaged scores as percentages.
# zero_division=0 states explicitly how a class with no predicted samples is
# scored: its precision/F1 count as 0, the same value the default falls back
# to, but without emitting UndefinedMetricWarning.
# NOTE(review): a macro recall of exactly one third would mean only one of the
# three classes is ever predicted - check the class balance before relying on
# the accuracy figure.
print("Accuracy:", accuracy_score(y_test, y_pred)*100)
print("Precision:", precision_score(y_test, y_pred, average='macro', zero_division=0)*100)
print("Recall:", recall_score(y_test, y_pred, average='macro', zero_division=0)*100)
print("F1 Score:", f1_score(y_test, y_pred, average='macro', zero_division=0)*100)

Accuracy: 87.47865680136596
Precision: 29.159552267121985
Recall: 33.33333333333333
F1 Score: 31.107063347500507


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# k-nearest neighbours with k=5: fit on the training split, predict the held-out split
model3 = KNeighborsClassifier(n_neighbors=5)
model3.fit(X_train, y_train)
y_pred = model3.predict(X_test)

# Report accuracy, then the macro-averaged scores, all as percentages
print("Accuracy:", accuracy_score(y_test, y_pred)*100)
for label, scorer in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, scorer(y_test, y_pred, average='macro')*100)

Accuracy: 89.2430278884462
Precision: 70.37732549374341
Recall: 72.12918395179182
F1 Score: 70.35834684385016
