In [1]:
# Tanner Tamondong
# November 28,2023
# Homework 3 

In [17]:
# import packages

%matplotlib inline

from pathlib import Path

from zipfile import ZipFile
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans


from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from dmba import classificationSummary

import nltk
from nltk import word_tokenize
from nltk.stem.snowball import EnglishStemmer
import matplotlib.pylab as plt
from dmba import printTermDocumentMatrix, classificationSummary, liftChart

# Fetch the 'punkt' NLTK resource (presumably what word_tokenize relies on below — confirm for the installed NLTK version)
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tannertamondong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

------------------------------ Question #1 ------------------------------

In [18]:
# a) Load the zip file into Python and create a label vector.
# Read every file (directories skipped) from the archive as raw bytes;
# a document is labelled 1 when its archive path contains 'rec.autos', else 0.
with ZipFile('AutoAndElectronics-1.zip') as archive:
    members = [entry for entry in archive.infolist() if not entry.is_dir()]
    corpus = [archive.read(entry) for entry in members]
label = [int('rec.autos' in entry.filename) for entry in members]

# b) Preprocess the documents. Explain what would be different if you did not perform the “stemming” step
        
class LemmaTokenizer:
    """Callable tokenizer: split a document into tokens, filter them, then stem.

    Despite the name, tokens are stemmed with EnglishStemmer rather than lemmatized.
    """

    def __init__(self):
        self.stemmer = EnglishStemmer()
        self.stopWords = set(ENGLISH_STOP_WORDS)

    def __call__(self, doc):
        """Return the stemmed tokens of ``doc``.

        Tokens that are not purely alphabetic, or that are stop words, are
        dropped before stemming.
        """
        return [
            self.stemmer.stem(token)
            for token in word_tokenize(doc)
            # Compare in lower case so capitalised stop words ("The") are
            # removed even when the input was not lower-cased beforehand.
            if token.isalpha() and token.lower() not in self.stopWords
        ]

# Bag-of-words counts: documents are decoded as latin1 and tokenized by LemmaTokenizer
preprocessor = CountVectorizer(
    encoding='latin1',
    tokenizer=LemmaTokenizer(),
)
preprocessedText = preprocessor.fit_transform(corpus)



b) In this example, it is essential to perform the stemming step when preprocessing the data. The purpose of this step is to reduce words to their base or root form. It is used to reduce the dimensionality of the features and to group together words that share a common root. 

If this step were skipped, the words would not be stemmed and every original form of each word would remain as a separate feature (for example, 'drive', 'drives', and 'driving' would each get their own column). In a large data set like this, that could cause a variety of issues such as diminished computational efficiency, lack of retained variability, and an overall loss of model accuracy. 

In [19]:
# Build a term-document matrix (terms as rows, one column per document)
# with a per-term total, then keep the 25 most frequent terms.
td = pd.DataFrame(
    preprocessedText.todense(),
    columns=preprocessor.get_feature_names_out(),
)
term_document_matrix = td.T
term_document_matrix.columns = [f'Sentence {doc_no}' for doc_no in range(1, td.shape[0] + 1)]
term_document_matrix['total_count'] = term_document_matrix.sum(axis=1)

# Top 25 words
term_document_matrix = term_document_matrix.sort_values(by='total_count', ascending=False).head(25)

term_document_matrix

Unnamed: 0,Sentence 1,Sentence 2,Sentence 3,Sentence 4,Sentence 5,Sentence 6,Sentence 7,Sentence 8,Sentence 9,Sentence 10,...,Sentence 1992,Sentence 1993,Sentence 1994,Sentence 1995,Sentence 1996,Sentence 1997,Sentence 1998,Sentence 1999,Sentence 2000,total_count
line,1,1,1,1,2,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,2366
subject,2,1,1,2,1,1,1,1,1,1,...,1,1,1,1,2,1,2,2,1,2165
car,12,1,0,0,5,0,2,3,2,3,...,2,0,0,0,0,0,0,0,0,2120
apr,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,2,1,1,2086
newsgroup,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,2079
date,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,2,1,1,2056
path,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,2043
organ,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1998
gmt,1,1,1,1,1,1,1,0,1,1,...,1,1,1,1,1,1,2,1,1,1831
use,0,0,0,0,0,0,0,0,0,0,...,1,0,2,2,0,0,4,0,0,1727


In [20]:

# c) Use the LSA to create 10 concepts. Explain what is different about the concept matrix, as opposed to the TF-IDF 
# Re-weight the raw term counts with TF-IDF
tfidfTransformer = TfidfTransformer()
tfidf = tfidfTransformer.fit_transform(preprocessedText)

# Extract 10 concepts using LSA (truncated SVD followed by a Normalizer).
# random_state is fixed so the concepts, and every result built on them,
# are reproducible across kernel restarts.
svd = TruncatedSVD(n_components=10, random_state=1)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

lsa_tfidf = lsa.fit_transform(tfidf)

In [21]:
# Show the document-by-concept scores produced by the LSA pipeline
concept_names = [f"Concept {k}" for k in range(1, 11)]
print('\nLSA Concept Matrix:\n')
print(pd.DataFrame(lsa_tfidf, columns=concept_names))



LSA Concept Matrix:

      Concept 1  Concept 2  Concept 3  Concept 4  Concept 5  Concept 6  \
0      0.480357   0.514619  -0.062376   0.041445   0.099161  -0.076733   
1      0.918473   0.317244  -0.082756  -0.025805  -0.112051  -0.083195   
2      0.719741  -0.263794  -0.088829  -0.262327  -0.201097   0.258924   
3      0.867736   0.005920  -0.048982   0.034038   0.125596   0.290610   
4      0.739163   0.536430  -0.066118  -0.028693   0.155316   0.106956   
...         ...        ...        ...        ...        ...        ...   
1995   0.891558  -0.134650  -0.039089   0.070673  -0.121519  -0.298236   
1996   0.668983  -0.514686  -0.108491  -0.118475  -0.334139   0.108640   
1997   0.383993  -0.291713  -0.391121   0.708106   0.305224   0.018637   
1998   0.761240  -0.364453  -0.005540   0.003333  -0.336890  -0.313310   
1999   0.230611  -0.220864  -0.131090  -0.506279   0.748047  -0.156300   

      Concept 7  Concept 8  Concept 9  Concept 10  
0     -0.597379  -0.212434  -0.250732

c) When comparing the 10 LSA concepts and the TF-IDF matrix, we can generate different insights from each. 

In the TF-IDF matrix, we are looking at how frequently each word occurs in a document, weighted by how common that word is across the whole data set. It shows whether a word is a common term throughout the data set or specific to the text we are analyzing. A word that appears frequently in one document but rarely in the others is likely more important to that document, though perhaps less important to the data set as a whole.

In the 10 LSA concepts, we look to identify underlying relationships between words and concepts in the set of documents. This matrix analyzes word frequency across multiple documents to identify themes/topics, and groups them into 'Concepts.' This helps identify relationships between terms and concepts that may not be obvious. This allows us to analyze customer feedback, preferences, areas of improvement, etc. It can also help reduce noise in the data and keep only the useful reference points.  




In [22]:
# d) Predictive model 
# Hold out 40% of the documents, then fit a logistic regression on the LSA concept scores
x_train, x_test, y_train, y_test = train_test_split(
    lsa_tfidf, label, random_state=1, test_size=0.4
)

logit_reg = LogisticRegression(solver='lbfgs').fit(x_train, y_train)

# Confusion matrix and accuracy on the held-out documents
test_predictions = logit_reg.predict(x_test)
classificationSummary(y_test, test_predictions)

Confusion Matrix (Accuracy 0.9575)

       Prediction
Actual   0   1
     0 383  14
     1  20 383


In [23]:
# Exploratory k-means clustering of the LSA concept scores.
# WCSS (inertia) only measures how compact the clusters are; k-means is
# unsupervised, so WCSS says nothing about predictive accuracy. The logistic
# regression used in class remains the predictive model for this question.

# n_init is set explicitly to silence the sklearn FutureWarning about the
# changing default and to keep the same number of initialisations across versions.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
clusters = kmeans.fit_predict(lsa_tfidf)


# One "Concept k" column per LSA component, however many were extracted
data_with_clusters = pd.DataFrame(
    lsa_tfidf,
    columns=["Concept {}".format(i + 1) for i in range(lsa_tfidf.shape[1])],
)
data_with_clusters['Cluster'] = clusters

print(f'\nWCSS: {kmeans.inertia_}\n')

print("Data with Clusters:")
print(data_with_clusters)

# Summarize cluster sizes
cluster_sizes = data_with_clusters['Cluster'].value_counts()
print("\nCluster Sizes:")
print(cluster_sizes)



  super()._check_params_vs_input(X, default_n_init=10)



WCSS: 584.5079118413041

Data with Clusters:
      Concept 1  Concept 2  Concept 3  Concept 4  Concept 5  Concept 6  \
0      0.480357   0.514619  -0.062376   0.041445   0.099161  -0.076733   
1      0.918473   0.317244  -0.082756  -0.025805  -0.112051  -0.083195   
2      0.719741  -0.263794  -0.088829  -0.262327  -0.201097   0.258924   
3      0.867736   0.005920  -0.048982   0.034038   0.125596   0.290610   
4      0.739163   0.536430  -0.066118  -0.028693   0.155316   0.106956   
...         ...        ...        ...        ...        ...        ...   
1995   0.891558  -0.134650  -0.039089   0.070673  -0.121519  -0.298236   
1996   0.668983  -0.514686  -0.108491  -0.118475  -0.334139   0.108640   
1997   0.383993  -0.291713  -0.391121   0.708106   0.305224   0.018637   
1998   0.761240  -0.364453  -0.005540   0.003333  -0.336890  -0.313310   
1999   0.230611  -0.220864  -0.131090  -0.506279   0.748047  -0.156300   

      Concept 7  Concept 8  Concept 9  Concept 10  Cluster  
0   

------------------------------ Question #2 ------------------------------

In [24]:
# Read the eBay auction records from disk and preview the first five rows
AUCTION_CSV = 'eBayAuctions.csv'
auction_df = pd.read_csv(AUCTION_CSV)

auction_df.head(5)

Unnamed: 0,Category,currency,sellerRating,Duration,endDay,ClosePrice,OpenPrice,Competitive?
0,Music/Movie/Game,US,3249,5,Mon,0.01,0.01,0
1,Music/Movie/Game,US,3249,5,Mon,0.01,0.01,0
2,Music/Movie/Game,US,3249,5,Mon,0.01,0.01,0
3,Music/Movie/Game,US,3249,5,Mon,0.01,0.01,0
4,Music/Movie/Game,US,3249,5,Mon,0.01,0.01,0


In [25]:
# Outcome variable   = Competitive
# Predictor variables = Category, currency, sellerRating, Duration, endDay, ClosePrice, OpenPrice

# Rename the outcome column, then one-hot encode Category, currency, endDay and Duration
columns_to_encode = ['Category', 'currency', 'endDay', 'Duration']

auction_df = pd.get_dummies(
    auction_df.rename(columns={'Competitive?': 'Competitive'}),
    columns=columns_to_encode,
    prefix=columns_to_encode,
)

auction_df.head()

Unnamed: 0,sellerRating,ClosePrice,OpenPrice,Competitive,Category_Antique/Art/Craft,Category_Automotive,Category_Books,Category_Business/Industrial,Category_Clothing/Accessories,Category_Coins/Stamps,...,endDay_Sat,endDay_Sun,endDay_Thu,endDay_Tue,endDay_Wed,Duration_1,Duration_3,Duration_5,Duration_7,Duration_10
0,3249,0.01,0.01,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,3249,0.01,0.01,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3249,0.01,0.01,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,3249,0.01,0.01,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,3249,0.01,0.01,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [26]:
# Separate predictors from the outcome, then hold out 40% of the rows for validation
y = auction_df['Competitive']
x = auction_df.drop('Competitive', axis=1)

x_train, x_valid, y_train, y_valid = train_test_split(
    x, y, random_state=1, test_size=0.40
)

*** PART A ***


In [27]:
# a) Run a classification tree, using the default settings of DecisionTreeClassifier. What is the overall accuracy? 
# Fit a single tree with default settings, then score it on the validation partition
defaultTree = DecisionTreeClassifier(random_state=1).fit(x_train, y_train)
classes = defaultTree.classes_

valid_predictions = defaultTree.predict(x_valid)
classificationSummary(y_valid, valid_predictions, class_names=classes)

Confusion Matrix (Accuracy 0.8783)

       Prediction
Actual   0   1
     0 315  38
     1  58 378


a) In the default classification tree, we see an overall accuracy of 0.8783. This model accurately predicted the Competitive outcome of 693 of the 789 total validation records. It proves to be a relatively accurate model overall, but can be improved upon.

*** PART B ***

In [28]:
# b) Run a boosted tree with the same predictors (use AdaBoostClassifier with DecisionTreeClassifier as the base estimator). For the validation set, what is the overall accuracy?
# Boosted tree: AdaBoost over 100 decision-tree base estimators
boostTree = AdaBoostClassifier(
    DecisionTreeClassifier(random_state=1),
    random_state=1,
    n_estimators=100,
)
boostTree.fit(x_train, y_train)

boost_predictions = boostTree.predict(x_valid)
classificationSummary(y_valid, boost_predictions, class_names=classes)


Confusion Matrix (Accuracy 0.8682)

       Prediction
Actual   0   1
     0 313  40
     1  64 372


b) In the boosted tree, the overall accuracy is slightly lower than the default tree, at 0.8682 (685 of the 789 validation records classified correctly). 

*** PART C ***

In [29]:
# c) Run a bagged tree with the same predictors (use BaggingClassifier). For the validation set, what is the overall accuracy?
# Bagged tree: 100 decision trees combined by BaggingClassifier
baggingTree = BaggingClassifier(
    DecisionTreeClassifier(random_state=1),
    random_state=1,
    n_estimators=100,
)
baggingTree.fit(x_train, y_train)

bagging_predictions = baggingTree.predict(x_valid)
classificationSummary(y_valid, bagging_predictions, class_names=classes)

Confusion Matrix (Accuracy 0.8973)

       Prediction
Actual   0   1
     0 334  19
     1  62 374


c) In the bagging tree, the overall accuracy is 0.8973. This is the most accurate of the three trees on the validation data: it correctly predicted the Competitive outcome of 708 of the 789 records. The bagging tree therefore proves to be the most accurate and useful model in this example. 