In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from preprocessing import preprocess_text
from scipy.sparse import hstack


# df = pd.read_csv('profiles.csv')
# df.head()

In [17]:

# Toy dataset: two free-text columns, one numeric column and a target.
data = pd.DataFrame(
    {
        'text_feat': ['This is my first sentence.', 'This is my second.'],
        'text_feat2': ['Mjau Vau VAuvau Mjaumjau', 'Mjau Vau'],
        'numeric_feat': [1, 2],
        'target': [3, 4],
    }
)
X = data.loc[:, ['text_feat', 'text_feat2', 'numeric_feat']]
y = data.loc[:, 'target']

# Names of the text columns (kept for reference; not used by the code below).
text_features = ['text_feat', 'text_feat2']

# Inner pipeline: bag-of-words token counts for a single text column.
text_transformer = Pipeline([('vec', CountVectorizer())])

# Route column 0 ('text_feat') through the text pipeline. The selector is a
# scalar rather than a list so the vectorizer is handed a 1-D column of
# strings; every other column is appended unchanged via 'passthrough'.
preprocessor = ColumnTransformer(
    [('text', text_transformer, 0)],
    remainder='passthrough',
)

# Outer pipeline whose only step is the column-wise preprocessor.
pipeline = Pipeline([('preprocessor', preprocessor)])

# Reference result: the text pipeline applied directly to the one column.
X_expected = text_transformer.fit_transform(X['text_feat'])

# Same data through the ColumnTransformer-wrapped pipeline; the result holds
# the count columns followed by the two passthrough columns.
X_test = pipeline.fit_transform(X)

# Show the reference matrix both densely and in its sparse repr.
print('Expected:', X_expected.toarray(), X_expected, sep='\n')

# Show the raw pipeline output, then the same output as a labelled frame.
print('Got:', X_test, sep='\n')
print(
    pd.DataFrame(
        X_test,
        columns=pipeline.named_steps['preprocessor'].get_feature_names_out(),
    )
)

# Output column names as reported by the fitted preprocessor.
feat_names = preprocessor.get_feature_names_out()
print(feat_names)

# Fit the preprocessor once more on its own and label the columns again.
print(
    pd.DataFrame(
        preprocessor.fit_transform(X),
        columns=preprocessor.get_feature_names_out(),
    )
)

# First row of the raw text column, for comparison with the counts above.
print(X['text_feat'].iloc[:1])

Expected:
[[1 1 1 0 1 1]
 [0 1 1 1 0 1]]
  (0, 5)	1
  (0, 1)	1
  (0, 2)	1
  (0, 0)	1
  (0, 4)	1
  (1, 5)	1
  (1, 1)	1
  (1, 2)	1
  (1, 3)	1
Got:
[[1 1 1 0 1 1 'Mjau Vau VAuvau Mjaumjau' 1]
 [0 1 1 1 0 1 'Mjau Vau' 2]]
  text__first text__is text__my text__second text__sentence text__this  \
0           1        1        1            0              1          1   
1           0        1        1            1              0          1   

      remainder__text_feat2 remainder__numeric_feat  
0  Mjau Vau VAuvau Mjaumjau                       1  
1                  Mjau Vau                       2  
['text__first' 'text__is' 'text__my' 'text__second' 'text__sentence'
 'text__this' 'remainder__text_feat2' 'remainder__numeric_feat']
  text__first text__is text__my text__second text__sentence text__this  \
0           1        1        1            0              1          1   
1           0        1        1            1              0          1   

      remainder__text_feat2 remainder__n