In [1]:
import pandas as pd

In [2]:
df=pd.read_csv("mall_customers.csv")
df.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [3]:
df.rename(columns={
    "Annual Income (k$)": "income",
    "Spending Score (1-100)": "Score",
    
},
          inplace=True)

In [4]:
df.head()

Unnamed: 0,CustomerID,Gender,Age,income,Score
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [5]:
df.shape

(200, 5)

In [6]:
dummies = pd.get_dummies(df.Gender)
dummies

Unnamed: 0,Female,Male
0,0,1
1,0,1
2,1,0
3,1,0
4,1,0
...,...,...
195,1,0
196,1,0
197,0,1
198,0,1


In [7]:
df1=pd.concat([df,dummies],axis="columns")

In [8]:
df1.head()

Unnamed: 0,CustomerID,Gender,Age,income,Score,Female,Male
0,1,Male,19,15,39,0,1
1,2,Male,21,15,81,0,1
2,3,Female,20,16,6,1,0
3,4,Female,23,16,77,1,0
4,5,Female,31,17,40,1,0


In [9]:
df2=df1.drop(["Gender","Female"],axis="columns")

In [10]:
df2.head()

Unnamed: 0,CustomerID,Age,income,Score,Male
0,1,19,15,39,1
1,2,21,15,81,1
2,3,20,16,6,0
3,4,23,16,77,0
4,5,31,17,40,0


In [11]:
df2=df2[["CustomerID","Age","income","Male","Score"]]

In [12]:
df2

Unnamed: 0,CustomerID,Age,income,Male,Score
0,1,19,15,1,39
1,2,21,15,1,81
2,3,20,16,0,6
3,4,23,16,0,77
4,5,31,17,0,40
...,...,...,...,...,...
195,196,35,120,0,79
196,197,45,126,0,28
197,198,32,126,1,74
198,199,32,137,1,18


In [13]:
df2.isnull().sum()

CustomerID    0
Age           0
income        0
Male          0
Score         0
dtype: int64

In [14]:
df2.describe()

Unnamed: 0,CustomerID,Age,income,Male,Score
count,200.0,200.0,200.0,200.0,200.0
mean,100.5,38.85,60.56,0.44,50.2
std,57.879185,13.969007,26.264721,0.497633,25.823522
min,1.0,18.0,15.0,0.0,1.0
25%,50.75,28.75,41.5,0.0,34.75
50%,100.5,36.0,61.5,0.0,50.0
75%,150.25,49.0,78.0,1.0,73.0
max,200.0,70.0,137.0,1.0,99.0


In [15]:
lower_limit, upper_limit = df2.Score.quantile([0.001, 0.999])
lower_limit, upper_limit


(1.0, 98.80100000000002)

In [16]:
outliers = df2[(df2.Score>upper_limit) | (df2.Score<lower_limit)]
outliers

Unnamed: 0,CustomerID,Age,income,Male,Score
11,12,35,19,0,99


In [17]:
df3 = df2[(df2.Score<upper_limit) & (df2.Score>lower_limit)]
df3.shape

(197, 5)

In [18]:
df2.shape

(200, 5)

max_limit = df3.Score.mean() + 4*df3.Score.std()
min_limit = df3.Score.mean() - 4*df3.Score.std()
max_limit, min_limit

df3[(df3.Score>max_limit) | (df3.Score<min_limit)]

In [33]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [34]:
X=df3.drop(["Score"],axis='columns')
y=df3.Score

In [35]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3)

In [36]:
model = SVC(kernel='rbf',C=10,gamma='auto')
model.fit(X_train,y_train)
model.score(X_test, y_test)

0.016666666666666666

In [39]:
model1 = RandomForestClassifier(n_estimators=20)
model1.fit(X_train,y_train)
model1.score(X_test, y_test)

0.016666666666666666

In [25]:
model_params = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }  
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10]
        }
    },
    'naive_bayes_gaussian': {
        'model': GaussianNB(),
        'params': {}
    },
    'naive_bayes_multinomial': {
        'model': MultinomialNB(),
        'params': {}
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini','entropy'],
            
        }
    }     
}

In [26]:
from sklearn.model_selection import GridSearchCV

scores = []

for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=4, return_train_score=False)
    clf.fit(X,y)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
df4= pd.DataFrame(scores,columns=['model','best_score','best_params'])
df4



Unnamed: 0,model,best_score,best_params
0,svm,0.045612,"{'C': 10, 'kernel': 'rbf'}"
1,random_forest,0.045816,{'n_estimators': 10}
2,logistic_regression,0.050816,{'C': 5}
3,naive_bayes_gaussian,0.04051,{}
4,naive_bayes_multinomial,0.005,{}
5,decision_tree,0.045612,{'criterion': 'entropy'}
